def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)

    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(-int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')

    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0}
            WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2})
              and cntry_cde = '{3}'
              and site_cde = '{4}'""".format(
        g['TBL_NME'],       #[0]
        g['MSMT_DTE_ID'],   #[1]
        retention_date_id,  #[2]
        g['CNTRY_CDE'],     #[3]
        g['SITE_CDE']       #[4]
    )
    dbmgr.query(q)

    # =============================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE DB statements
    # =============================================================================

    # PASS 1 - INDUSTRY COUNT =====================================================
    facet_type = 'INDUSTRY'
    industry = g['INDUSTRY']
    industries_array = []

    # CONVERTS LIST OBJECT TO ARRAY FOR LOOPING
    for item in industry.split(','):  # COMMA, OR OTHER
        industries_array.append(item)

    # LOOP THROUGH ALL THE ITEMS IN INDUSTRIES
    for industry in industries_array:
        facet_desc = industry.upper().replace('-JOBS', '')
        facet_desc = facet_desc.replace(r'/', '')
        for i in range(10):
            print("iteration {0} ({1}) starting".format(i, facet_desc))
            while True:
                try:
                    time.sleep(rndm_sleep)  # INSERTS PAUSE TO ASSIST REFLECTING HUMAN INTERACTION ON WEBPAGE

                    # =============================================================
                    # PASS URL TO RETURN HTML FROM SITE PAGE
                    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
                    # =============================================================
                    url = g['URL'] + r'/{}'.format(industry)
                    passedHTML = pyHTMLPass.htmlPass(url, **g)
                    soup = BeautifulSoup(passedHTML, "html.parser")
                    #print(soup)
                    #facet_count = re.search(r'JOBS 1 TO 10 OF(.*?)</DIV>', str(soup).encode("utf-8","ignore").decode('ascii', 'ignore').upper()).group(1)
                    facet_count = re.search(
                        r'PAGE 1 OF(.*?)JOBS</DIV>',
                        str(soup).encode("utf-8", "ignore").decode('ascii', 'ignore').upper()).group(1)
                    facet_count = int(facet_count.replace(',', ''))
                except Exception:
                    e = sys.exc_info()
                    print("iteration {0} ({1}) failed with error : {2}".format(i, facet_desc, e))
                    continue
                break

            # =================================================================
            # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
            # =================================================================
            dbmgr = pyDB(g['DB'])
            q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
                    VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                g['TBL_NME'],      #[0]
                g['MSMT_DTE_ID'],  #[1]
                g['DATA_TYPE'],    #[2]
                g['CNTRY_CDE'],    #[3]
                g['SITE_CDE'],     #[4]
                facet_type,        #[5]
                facet_desc,        #[6]
                facet_count,       #[7]
                g['STARTED_AT'],   #[8]
                ''                 #[9]
            )
            dbmgr.query(q)
            break

    # =============================================================================
    # WRITE HTML PAGE TO FILE
    # =============================================================================
    if g['WRITE_HTML_TO_FILE'] == 'Y':
        file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g['SITE_CDE'] + '_' + 'SITE_LISTING' + '.html'
        with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name, 'w+', encoding='utf-8') as f:
            f.writelines(str(soup))

    # =============================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # =============================================================================

    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime("%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0}
            SET finished_at = '{1}'
            WHERE cntry_cde = '{2}'
              and msmt_dte_id = {3}""".format(
        g['TBL_NME'],     #[0]
        finished_at,      #[1]
        g['CNTRY_CDE'],   #[2]
        g['MSMT_DTE_ID']  #[3]
    )
    dbmgr.query(q)
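# -----------------------------------------------------------------------------
# NOTE (illustrative sketch, not part of the original scrapers): the INSERT and
# DELETE statements above build SQL by interpolating scraped text directly into
# the statement, so a facet_desc containing a quote can break the query. A
# minimal sketch of the same insert using parameter binding is shown below. It
# assumes the local DB is a SQLite file and that the column list matches the
# INSERT above; pyDB.query() may not accept bound parameters, so this bypasses
# that helper entirely.
# -----------------------------------------------------------------------------
import sqlite3

def insert_facet_row(db_path, tbl_nme, row):
    # row is a 9-tuple matching the column list used by the scrapers above
    q = ("INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, "
         "FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) "
         "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)").format(tbl_nme)
    with sqlite3.connect(db_path) as conn:
        conn.execute(q, row)  # values are bound by the driver, never interpolated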
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)

    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(-int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')

    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0}
            WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2})
              and cntry_cde = '{3}'
              and site_cde = '{4}'""".format(
        g['TBL_NME'],       #[0]
        g['MSMT_DTE_ID'],   #[1]
        retention_date_id,  #[2]
        g['CNTRY_CDE'],     #[3]
        g['SITE_CDE']       #[4]
    )
    dbmgr.query(q)

    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL'] + g['URL_PART1']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    #soup = soup.encode("utf-8")  # CODE PAGE ERROR - CONVERTS
    #soup = str(soup)
    #print(soup)

    # =============================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE DB statements
    # =============================================================================

    # PASS 1 - TOTAL COUNT ========================================================
    facet_type = 'TOTAL'
    facet_desc = 'ALL JOBS'
    nbr = re.search('<title>(.*?)</title>', str(soup.encode("utf-8"))).group(1)
    nbr = str(nbr).replace(',', '')
    nbr = re.findall(r'\d+', nbr)
    facet_count = nbr[0]
    facet_count = int(facet_count)

    # =============================================================================
    # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
            VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
        g['TBL_NME'],      #[0]
        g['MSMT_DTE_ID'],  #[1]
        g['DATA_TYPE'],    #[2]
        g['CNTRY_CDE'],    #[3]
        g['SITE_CDE'],     #[4]
        facet_type,        #[5]
        facet_desc,        #[6]
        facet_count,       #[7]
        g['STARTED_AT'],   #[8]
        ''                 #[9]
    )
    dbmgr.query(q)

    # PASS 2 - INDUSTRY COUNT =====================================================
    for ul in soup.find_all('ul', class_='facet'):
        for li in ul.find_all('li'):
            # RETURN THE FACET TEXT (SECTION TITLE)
            # ASSUMES THE FIRST ROW OF THE FACET IS THE "TITLE" ROW - BREAKS IF IT ISNT
            facet = li.find('strong')
            if facet:
                facet_type = facet.text.upper()
            else:
                facet_type = facet_type.upper()  # IF NONE IS FOUND, APPLY CURRENT FACET_TYPE VALUE TO NEXT FACET_TYPE VALUE

            facet_desc = li.find('a')
            if facet_desc:  # CHECKS IF THERE IS A RESULT ON THE SEARCH FOR THE "A" ANCHOR (REMOVES THE TITLE OF THE SECTIONS BY DEFAULT - RETURNED ABOVE)
                facet_desc = facet_desc.text.upper()
                facet_desc = re.sub(r"[!@#$']", '', str(facet_desc))  # REMOVES SPECIAL CHARACTERS FROM STRING
                facet_count = li.find('span')
                facet_count = int(facet_count.text.replace(',', ''))

                # =============================================================
                # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                # =============================================================
                dbmgr = pyDB(g['DB'])
                q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
                        VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                    g['TBL_NME'],      #[0]
                    g['MSMT_DTE_ID'],  #[1]
                    g['DATA_TYPE'],    #[2]
                    g['CNTRY_CDE'],    #[3]
                    g['SITE_CDE'],     #[4]
                    facet_type,        #[5]
                    facet_desc,        #[6]
                    facet_count,       #[7]
                    g['STARTED_AT'],   #[8]
                    ''                 #[9]
                )
                dbmgr.query(q)
            else:
                pass  # IF NO "A" ANCHOR IS FOUND, IGNORE

    # =============================================================================
    # WRITE HTML PAGE TO FILE
    # =============================================================================
    if g['WRITE_HTML_TO_FILE'] == 'Y':
        file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g['SITE_CDE'] + '_' + facet_type.replace(' ', '_') + '_' + facet_desc.replace(' ', '_') + '.html'
        with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name, 'w+', encoding='utf-8') as f:
            f.writelines(str(soup))

    # =============================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # =============================================================================

    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime("%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0}
            SET finished_at = '{1}'
            WHERE cntry_cde = '{2}'
              and msmt_dte_id = {3}""".format(
        g['TBL_NME'],     #[0]
        finished_at,      #[1]
        g['CNTRY_CDE'],   #[2]
        g['MSMT_DTE_ID']  #[3]
    )
    dbmgr.query(q)
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)

    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(-int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')

    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0}
            WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2})
              and cntry_cde = '{3}'
              and site_cde = '{4}'""".format(
        g['TBL_NME'],       #[0]
        g['MSMT_DTE_ID'],   #[1]
        retention_date_id,  #[2]
        g['CNTRY_CDE'],     #[3]
        g['SITE_CDE']       #[4]
    )
    dbmgr.query(q)

    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    #print(soup)

    # =============================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE DB statements
    # =============================================================================

    # PASS 1 - CATEGORY COUNT =====================================================
    for div in soup.find_all('div', id="category"):
        for div_sub in div.find_all('div'):
            txt = re.findall(r'SECTORS">(.+?)</A>', str(div_sub).upper())
            txt = str(txt[0])
            facet_type = txt.upper()
            #print(facet_type)
        for li in div.find_all('li'):
            txt = re.findall(r'JOBS">(.+?)</A>', str(li).upper())
            txt = str(txt[0])
            facet_desc = txt.upper().replace('&amp;', '&')
            find_nbr = re.findall(r'\([0-9]*\)', str(li))  # find any within brackets
            if find_nbr:
                find_nbr = str(find_nbr[0])
                facet_count = find_nbr.replace('(', '').replace(')', '')
                facet_count = int(facet_count)

                # =============================================================
                # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                # =============================================================
                dbmgr = pyDB(g['DB'])
                q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
                        VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                    g['TBL_NME'],      #[0]
                    g['MSMT_DTE_ID'],  #[1]
                    g['DATA_TYPE'],    #[2]
                    g['CNTRY_CDE'],    #[3]
                    g['SITE_CDE'],     #[4]
                    facet_type,        #[5]
                    facet_desc,        #[6]
                    facet_count,       #[7]
                    g['STARTED_AT'],   #[8]
                    ''                 #[9]
                )
                dbmgr.query(q)

    # PASS 2 - REGION COUNT =======================================================
    for div in soup.find_all('div', id="location"):
        for div_sub in div.find_all('div'):
            txt = re.findall(r'LOCATIONS">(.+?)</A>', str(div_sub).upper())
            txt = str(txt[0])
            facet_type = txt.upper()
            #print(facet_type)
        for li in div.find_all('li'):
            txt = re.findall(r'">(.+?)</A>', str(li).upper())
            txt = str(txt[0])
            facet_desc = txt.upper().replace('&amp;', '&')
            find_nbr = re.findall(r'\([0-9]*\)', str(li))  # find any within brackets
            if find_nbr:
                find_nbr = str(find_nbr[0])
                facet_count = find_nbr.replace('(', '').replace(')', '')
                facet_count = int(facet_count)

                # =============================================================
                # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                # =============================================================
                dbmgr = pyDB(g['DB'])
                q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
                        VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                    g['TBL_NME'],      #[0]
                    g['MSMT_DTE_ID'],  #[1]
                    g['DATA_TYPE'],    #[2]
                    g['CNTRY_CDE'],    #[3]
                    g['SITE_CDE'],     #[4]
                    facet_type,        #[5]
                    facet_desc,        #[6]
                    facet_count,       #[7]
                    g['STARTED_AT'],   #[8]
                    ''                 #[9]
                )
                dbmgr.query(q)

    # PASS 3 - JOB TYPE ===========================================================
    for div in soup.find_all('div', id="subcategory"):
        for div_sub in div.find_all('div'):
            txt = re.findall(r'TYPES">(.+?)</A>', str(div_sub).upper())
            txt = str(txt[0])
            facet_type = txt.upper()
            #print(facet_type)
        for li in div.find_all('li'):
            txt = re.findall(r'JOBS">(.+?)</A>', str(li).upper())
            txt = str(txt[0])
            facet_desc = txt.upper().replace('&amp;', '&')
            find_nbr = re.findall(r'\([0-9]*\)', str(li))  # find any within brackets
            if find_nbr:
                find_nbr = str(find_nbr[0])
                facet_count = find_nbr.replace('(', '').replace(')', '')
                facet_count = int(facet_count)

                # =============================================================
                # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                # =============================================================
                dbmgr = pyDB(g['DB'])
                q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
                        VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                    g['TBL_NME'],      #[0]
                    g['MSMT_DTE_ID'],  #[1]
                    g['DATA_TYPE'],    #[2]
                    g['CNTRY_CDE'],    #[3]
                    g['SITE_CDE'],     #[4]
                    facet_type,        #[5]
                    facet_desc,        #[6]
                    facet_count,       #[7]
                    g['STARTED_AT'],   #[8]
                    ''                 #[9]
                )
                dbmgr.query(q)

    # =============================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # =============================================================================

    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime("%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0}
            SET finished_at = '{1}'
            WHERE cntry_cde = '{2}'
              and msmt_dte_id = {3}""".format(
        g['TBL_NME'],     #[0]
        finished_at,      #[1]
        g['CNTRY_CDE'],   #[2]
        g['MSMT_DTE_ID']  #[3]
    )
    dbmgr.query(q)
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)

    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(-int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')

    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0}
            WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2})
              and cntry_cde = '{3}'
              and site_cde = '{4}'""".format(
        g['TBL_NME'],       #[0]
        g['MSMT_DTE_ID'],   #[1]
        retention_date_id,  #[2]
        g['CNTRY_CDE'],     #[3]
        g['SITE_CDE']       #[4]
    )
    dbmgr.query(q)

    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    #soup = soup.encode("utf-8","ignore").decode('ascii', 'ignore')
    #print(soup)

    # =============================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE DB statements
    # =============================================================================

    # PASS 1 - INDUSTRY & REGION COUNT ============================================
    for div in soup.find_all('div', id='sr-filters-hidden'):
        for links in div.find_all('a'):
            full_ref = str(links)
            full_ref = full_ref.upper()
            print(full_ref)
            if any(item in full_ref for item in g['ITEM_CHECK']):
                if r"'LOCATION'" in full_ref:
                    facet_type = 'REGION'
                elif r"'CATEGORY'" in full_ref:
                    facet_type = 'CATEGORY'
                elif r"'CUSTOMER'" in full_ref:
                    facet_type = 'CUSTOMER'
                elif r"'POSITIONTYPE'" in full_ref:
                    facet_type = 'JOB TYPE'
                elif r"'INDUSTRY'" in full_ref:
                    facet_type = 'INDUSTRY'

                # A NUMERIC SPAN IS THE COUNT; ANY OTHER SPAN IS THE DESCRIPTION
                for span in links.find_all('span'):
                    try:
                        int(span.text)
                        facet_count = span.text
                        facet_count = facet_count.replace(',', '')
                        facet_count = int(facet_count)
                    except Exception:
                        facet_desc = span.text.upper()
                        facet_desc = facet_desc.replace(r"'", '')

                # =============================================================
                # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                # =============================================================
                dbmgr = pyDB(g['DB'])
                q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
                        VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                    g['TBL_NME'],      #[0]
                    g['MSMT_DTE_ID'],  #[1]
                    g['DATA_TYPE'],    #[2]
                    g['CNTRY_CDE'],    #[3]
                    g['SITE_CDE'],     #[4]
                    facet_type,        #[5]
                    facet_desc,        #[6]
                    facet_count,       #[7]
                    g['STARTED_AT'],   #[8]
                    ''                 #[9]
                )
                dbmgr.query(q)

    # =============================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # =============================================================================

    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime("%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0}
            SET finished_at = '{1}'
            WHERE cntry_cde = '{2}'
              and msmt_dte_id = {3}""".format(
        g['TBL_NME'],     #[0]
        finished_at,      #[1]
        g['CNTRY_CDE'],   #[2]
        g['MSMT_DTE_ID']  #[3]
    )
    dbmgr.query(q)
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)

    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(-int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')

    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0}
            WHERE (captr_dte_id = {1} or captr_dte_id <= {2})""".format(
        g['TBL_NME'],       #[0]
        g['MSMT_DTE_ID'],   #[1]
        retention_date_id,  #[2]
    )
    dbmgr.query(q)

    # =============================================================================
    # LOOP THROUGH DATES FOR HISTORICAL SCRAPES (ONLY REQUIRED FOR FIRST RUN)
    # =============================================================================
    # dts = g['MONTH_LIST']  # ONLY NEEDED TO RUN HISTORY
    first_dy_curr_mth = fdttm.today().replace(day=1)
    dts = []
    dts.append((first_dy_curr_mth - datetime.timedelta(days=1)).strftime('%b.%Y').lower())  # PREVIOUS MONTH
    dts.append(time.strftime('%b.%Y').lower())  # CURRENT MONTH
    dts.append((datetime.date.today() + relativedelta.relativedelta(months=1)).strftime('%b.%Y').lower())  # NEXT MONTH X1
    dts.append((datetime.date.today() + relativedelta.relativedelta(months=2)).strftime('%b.%Y').lower())  # NEXT MONTH X2
    #dts.append((datetime.date.today() + relativedelta.relativedelta(months=3)).strftime('%b.%Y').lower())  # NEXT MONTH X3 -- CALENDAR DOESNT GO THIS FAR FORWARD
    #dts_array = []  # CONVERTS LIST OBJECT TO ARRAY FOR LOOPING

    for item in dts:
        # =========================================================================
        # PASS URL TO RETURN HTML FROM SITE PAGE
        # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
        # =========================================================================
        url = g['URL'] + item
        passedHTML = pyHTMLPass.htmlPass(url, **g)
        soup = BeautifulSoup(passedHTML, "html.parser")
        #print(soup)

        # =========================================================================
        # SCRAPE PART - START
        # - this should be the primary section of code that changes
        # - only other sections that "may" change are DELETE and UPDATE DB statements
        # =========================================================================

        # GET MONTH AND YEAR FROM HEADER
        for div in soup.find_all('div', class_='head'):
            for span in div.find_all('span'):
                if '<strong>' in str(span).lower():
                    dt_part = span.text.upper().split(' ')
                    annce_mth = dt_part[-2]
                    annce_yr = dt_part[-1]

        for tab in soup.find_all('table', class_='calendar__table'):
            for row in tab.find_all('tr'):
                if ('calendar__row' in str(row).lower()
                        and 'day-breaker' not in str(row).lower()
                        and 'calendarexpanded__container' not in str(row).lower()):
                    cell_nbr = 1  # INITIALISE CELL NBR TO 1 - DATA WILL BE ASSIGNED BASED ON CELL NBR (POSITION) WHICH SHOULDNT CHANGE
                    for cell in row.find_all('td'):
                        #print(cell)
                        if cell_nbr == 1:  # DATE OF MONTH (IF NOT NULL)
                            try:
                                dt = re.search('<span>(.*)</span>', str(cell).lower())
                                dt = dt.group(1).replace('</span>', '').upper()
                                dt_part = dt.split(' ')
                                mth_nme = dt_part[0]
                                dy_nbr = dt_part[1]
                                if len(dy_nbr) == 1:
                                    dy_nbr = '0' + str(dy_nbr)
                                else:
                                    dy_nbr = str(dy_nbr)
                                mth_nbr = g['MONTH_NBR_CNVRT'].get(mth_nme)
                                annce_dt = str(annce_yr) + '-' + str(mth_nbr) + '-' + str(dy_nbr)
                                msmt_dte_id = str(annce_yr) + str(mth_nbr) + str(dy_nbr)
                            except Exception:
                                annce_dt = annce_dt  # KEEP THE PREVIOUS ROW'S DATE
                        elif cell_nbr == 2:  # TIME OF DAY (MIGHT BE "ALL DAY" EVENT)
                            if cell.text.strip().upper() != '':
                                annce_tm = cell.text.strip().upper()
                            else:
                                try:
                                    annce_tm = annce_tm
                                except Exception:
                                    annce_tm = ''
                        elif cell_nbr == 3:  # CNTRY CDE
                            try:
                                cntry_cde = cell.text.strip().upper()
                            except Exception:
                                cntry_cde = ''
                        elif cell_nbr == 4:  # IMPACT (LOW / MEDIUM / HIGH)
                            result = cell.find('span')
                            if result is not None:
                                impact = result.get('title')
                                impact = impact.upper().replace('IMPACT EXPECTED', '').strip()
                            else:
                                impact = ''
                        elif cell_nbr == 5:  # EVENT DESCRIPTION
                            try:
                                for span in cell.find_all('span'):
                                    event_desc = span.text.strip().upper()
                            except Exception:
                                event_desc = ''
                        elif cell_nbr == 6:  # -- IGNORE -- LINK TO DETAILS
                            pass
                        elif cell_nbr == 7:  # ACTUAL VALUE
                            try:
                                actual_val = cell.text.strip()
                            except Exception:
                                actual_val = ''
                        elif cell_nbr == 8:  # FORECAST VALUE
                            try:
                                forecast_val = cell.text.strip()
                            except Exception:
                                forecast_val = ''
                        elif cell_nbr == 9:  # PREVIOUS VALUE
                            try:
                                previous_val = cell.text.strip()
                            except Exception:
                                previous_val = ''
                        elif cell_nbr == 10:  # -- IGNORE -- LINK TO GRAPH
                            pass
                        else:
                            continue
                        cell_nbr = cell_nbr + 1

                    # GENERATE A CODE FROM THE DESC AND CRNCY
                    annce_cde = pyLIB.codeGen(cntry_cde + ' ' + event_desc)  # GET CODE

                    # =========================================================
                    # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                    # =========================================================
                    dbmgr = pyDB(g['DB'])
                    q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, SITE_CDE, ANNCE_DTE, ANNCE_TM, CNTRY_CDE, ANNCE_CDE, ANNCE_DESC, IMPACT, ACTUAL, FORECAST, PREVIOUS, CAPTR_DTE_ID, STARTED_AT, FINISHED_AT)
                            VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', "{8}", '{9}', '{10}', '{11}', '{12}', {13}, '{14}', '{15}')""".format(
                        g['TBL_NME'],      #[0]
                        msmt_dte_id,       #[1]
                        g['DATA_TYPE'],    #[2]
                        g['SITE_CDE'],     #[3]
                        annce_dt,          #[4]
                        annce_tm,          #[5]
                        cntry_cde,         #[6]
                        annce_cde,         #[7]
                        event_desc,        #[8]
                        impact,            #[9]
                        actual_val,        #[10]
                        forecast_val,      #[11]
                        previous_val,      #[12]
                        g['MSMT_DTE_ID'],  #[13]
                        g['STARTED_AT'],   #[14]
                        ''                 #[15]
                    )
                    #print(q)
                    dbmgr.query(q)

    # =============================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # =============================================================================

    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime("%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0}
            SET finished_at = '{1}'
            WHERE captr_dte_id = {2}""".format(
        g['TBL_NME'],     #[0]
        finished_at,      #[1]
        g['MSMT_DTE_ID']  #[2]
    )
    dbmgr.query(q)
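# -----------------------------------------------------------------------------
# NOTE (illustrative sketch): the calendar scraper above derives annce_dt and
# msmt_dte_id (YYYYMMDD) through a month-name map kept in g['MONTH_NBR_CNVRT'].
# The exact keys stored in the repo's vars table are not shown in this section;
# the sketch below assumes three-letter upper-case month names mapped to
# zero-padded month-number strings.
# -----------------------------------------------------------------------------
MONTH_NBR_CNVRT = {'JAN': '01', 'FEB': '02', 'MAR': '03', 'APR': '04',
                   'MAY': '05', 'JUN': '06', 'JUL': '07', 'AUG': '08',
                   'SEP': '09', 'OCT': '10', 'NOV': '11', 'DEC': '12'}

def to_msmt_dte_id(annce_yr, mth_nme, dy_nbr):
    # e.g. to_msmt_dte_id('2018', 'MAR', '7') -> '20180307'
    return str(annce_yr) + MONTH_NBR_CNVRT[mth_nme] + str(dy_nbr).zfill(2)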
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)

    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(-int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')

    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0}
            WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2})
              and cntry_cde = '{3}'
              and site_cde = '{4}'""".format(
        g['TBL_NME'],       # [0]
        g['MSMT_DTE_ID'],   # [1]
        retention_date_id,  # [2]
        g['CNTRY_CDE'],     # [3]
        g['SITE_CDE']       # [4]
    )
    dbmgr.query(q)

    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    # print(soup)

    # =============================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE DB statements
    # =============================================================================

    # PASS 1 - TOTAL COUNT ========================================================
    for div in soup.find_all('div', class_='inner cover'):
        chk_str = str(div).upper()
        chk_str = chk_str.replace(',', '')
        nbr = re.search(r'SEARCH.<B>(\d*)</B>', chk_str).group(1)
        facet_type = 'TOTAL'
        facet_desc = 'ALL JOBS'
        facet_count = int(nbr)

        # =========================================================================
        # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
        # =========================================================================
        dbmgr = pyDB(g['DB'])
        q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
                VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
            g['TBL_NME'],      # [0]
            g['MSMT_DTE_ID'],  # [1]
            g['DATA_TYPE'],    # [2]
            g['CNTRY_CDE'],    # [3]
            g['SITE_CDE'],     # [4]
            facet_type,        # [5]
            facet_desc,        # [6]
            facet_count,       # [7]
            g['STARTED_AT'],   # [8]
            ''                 # [9]
        )
        dbmgr.query(q)

    # =============================================================================
    # WRITE HTML PAGE TO FILE
    # =============================================================================
    if g['WRITE_HTML_TO_FILE'] == 'Y':
        file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g['SITE_CDE'] + '_' + facet_type.replace(' ', '_') + '_' + facet_desc.replace(' ', '_') + '.html'
        with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name, 'w+', encoding='utf-8') as f:
            f.writelines(str(soup))

    # PASS 2 - ALL OTHER FACETS ===================================================

    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL'] + g['URL_PART1']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    # print(soup)

    for div in soup.find_all('div', class_="results-filter-content"):
        for section in div.find_all('section'):
            # FACET TYPE
            facet_type = section.find('h3')
            facet_type = facet_type.text.upper()
            facet_type = facet_type.replace('HIDE FILTERS', '').replace('DISPLAY FILTERS', '').replace('HELP - EDUCATION OR TRAINING', '').strip()
            if 'REGIONS' in facet_type:
                facet_type = 'REGIONS'
            elif 'CATEGORIES' in facet_type:
                facet_type = 'CATEGORY'
            # print(facet_type)

            # FACET DESCRIPTION AND COUNT
            for li in section.find_all('li'):
                txt = li.text
                txt = txt.replace('\\', '~').replace('\n', '~').replace('\r', '~').replace('\t', '~').upper()
                txt = txt.replace('~', '').replace("'", "").strip()
                # print(txt)

                # FACET DESCRIPTION ===========================================
                facet_desc = re.findall(' FOUND(.*)', str(txt))
                facet_desc = cleanhtml(facet_desc[0])
                #facet_desc = str(facet_desc[0]).strip()
                # print(facet_desc)

                # FACET COUNT =================================================
                facet_count = re.findall(r'(\d*)', txt)
                facet_count = str(facet_count[0])
                facet_count = facet_count.replace(',', '')
                try:
                    facet_count = int(facet_count)
                except Exception:
                    facet_count = 0
                # print(facet_count)

                # =============================================================
                # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                # =============================================================
                dbmgr = pyDB(g['DB'])
                q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
                        VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                    g['TBL_NME'],      # [0]
                    g['MSMT_DTE_ID'],  # [1]
                    g['DATA_TYPE'],    # [2]
                    g['CNTRY_CDE'],    # [3]
                    g['SITE_CDE'],     # [4]
                    facet_type,        # [5]
                    facet_desc,        # [6]
                    facet_count,       # [7]
                    g['STARTED_AT'],   # [8]
                    ''                 # [9]
                )
                dbmgr.query(q)

    # =============================================================================
    # WRITE HTML PAGE TO FILE
    # =============================================================================
    if g['WRITE_HTML_TO_FILE'] == 'Y':
        file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g['SITE_CDE'] + '_' + 'SITE_LISTING' + '.html'
        with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name, 'w+', encoding='utf-8') as f:
            f.writelines(str(soup))

    # =============================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # =============================================================================

    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime("%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0}
            SET finished_at = '{1}'
            WHERE cntry_cde = '{2}'
              and msmt_dte_id = {3}""".format(
        g['TBL_NME'],     # [0]
        finished_at,      # [1]
        g['CNTRY_CDE'],   # [2]
        g['MSMT_DTE_ID']  # [3]
    )
    dbmgr.query(q)
def scrape():
    # =============================================================================
    # DELETE ANY COMPRESSED OR CSV FILES FOUND IN DATA DIRECTORY READY FOR RETRIEVAL
    # =============================================================================

    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL'] + g['URL_PART1']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    #print(soup)

    # =============================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE DB statements
    # =============================================================================

    # COLLECTS LIST OF TARGET LINKS (SUB PAGES) OF CCY PAIRS
    link_list = []
    for table in soup.find_all('table'):
        for links in table.find_all('a'):
            link = str(links.get('href'))
            ccy_pair = link[-6:]  # GET THE CCY PAIR FROM THE LINK TO COMPARE TO ACCEPTED LIST
            if ccy_pair in str(g['CCY_PAIRS']):  # 14 pairs
                link_list.append(link)

    # LOOP THROUGH TARGET LINKS TO DETERMINE YEAR LINKS
    for link in link_list:
        # UPDATE g DICTIONARY SO WEBPAGE WILL NOT USE SELENIUM DRIVER
        g['USES_WEB_DRVR'] = 'N'

        # =========================================================================
        # PASS URL TO RETURN HTML FROM SITE PAGE
        # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
        # =========================================================================
        url = g['URL'] + link
        passedHTML = pyHTMLPass.htmlPass(url, **g)
        soup = BeautifulSoup(passedHTML, "html.parser")
        #print(soup)

        # RETURN THE MOST RECENT YEAR LINK FROM TABLE
        year_list = []
        for table in soup.find_all('table'):
            # FIRST LOOP - GET THE RELEVANT YEAR (IF MONTH IS JANUARY THEN RETURN PREVIOUS YEAR)
            for links in table.find_all('a'):
                link = str(links.get('href'))
                year = link[-4:]  # GET THE YEAR FROM THE LINK TO COMPARE TO ACCEPTED LIST
                year = int(year)
                year_list.append(year)
            if int(g['MONTH_NBR']) == 1:  # CHECKS IF CURRENT MONTH IS JANUARY - IF YES, WE WANT THE PREVIOUS YEAR LINK TO GET DECEMBER DATA
                year = max(year_list) - 1
            else:
                year = max(year_list)
            # SECOND LOOP - GETS THE RELEVANT YEAR LINK
            for links in table.find_all('a'):
                curr_link = str(links.get('href'))
                if str(year) in curr_link:
                    link = curr_link

        # =========================================================================
        # PASS URL TO RETURN HTML FROM SITE PAGE
        # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
        # =========================================================================
        url = g['URL'] + link
        passedHTML = pyHTMLPass.htmlPass(url, **g)
        soup = BeautifulSoup(passedHTML, "html.parser")
        #print(soup)

        month_list = []
        for div in soup.find_all('div', class_='page-content'):
            # FIRST LOOP - GET THE RELEVANT MONTH (PART OF CURRENT MONTH IS POSTED - WE ONLY WANT THE FULL "PREVIOUS" MONTH)
            for links in div.find_all('a'):
                if 'CSV' in str(links).upper():
                    link = str(links.get('href'))
                    month = link[-2:]  # GET THE MONTH NBR FROM THE LINK TO COMPARE TO ACCEPTED LIST
                    month = int(month.replace(r'/', ''))
                    month_list.append(month)

            # RETURN PREVIOUS MONTH NBR (CURRENT MONTH WILL ONLY BE A PART MONTH FILE - WE WANT THE LAST FULL MONTH)
            if g['MONTH_NBR'] == '01':
                month = 12
            else:
                month = max(month_list) - 1
            if len(str(month)) == 1:
                month_str = '0' + str(month)
            else:
                month_str = str(month)
            month_long_text = list(g['MONTH_NBR_CNVRT'].keys())[list(g['MONTH_NBR_CNVRT'].values()).index(month_str)]

            # SECOND LOOP - GETS THE RELEVANT MONTH LINK
            for links in div.find_all('a'):
                curr_link = str(links.get('href'))
                if month_long_text in str(links).upper():
                    link = curr_link

        # CREATE A SEARCH STRING
        # USED TO SEARCH FOR THAT FILE AND IF DOWNLOAD HAS COMPLETED
        li = link.split('/')
        li = li[-2:]
        li = ''.join(li)
        fileSearchStr = str(li) + '.zip'

        # UPDATE g DICTIONARY SO WEBPAGE WILL USE SELENIUM DRIVER
        g['USES_WEB_DRVR'] = 'Y'

        # =========================================================================
        # PASS URL TO RETURN HTML FROM SITE PAGE
        # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
        # =========================================================================
        url = g['URL'] + link
        linkId = g['DOWNLOAD_ID']
        dlLink = pyHTMLPass.htmlDownloadLink(url, fileSearchStr, linkId, **g)

        # =========================================================================
        # MOVE DOWNLOADED FILES
        # CHROME WEBDRIVER DOESNT "CURRENTLY" ALLOW TO SET DOWNLOAD PATH
        # FILES ARE AUTO DOWNLOADED TO THE SYSTEM DOWNLOAD DIRECTORY
        # =========================================================================
        pyLIB.moveFiles(g['DEFAULT_SYS_DOWNLOAD_PATH'], g['FILE_MOVE_DEST_PATH'], fileSearchStr)
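# -----------------------------------------------------------------------------
# NOTE (illustrative sketch): pyLIB.moveFiles is not defined in this section.
# A minimal stand-in is sketched below under the assumption that it matches
# files in the system download directory whose names contain fileSearchStr and
# moves them to the destination directory; the real helper may behave
# differently (e.g. waiting for the download to finish first).
# -----------------------------------------------------------------------------
import glob
import os
import shutil

def move_files(src_dir, dest_dir, search_str):
    # move every file whose name contains search_str from src_dir to dest_dir
    for path in glob.glob(os.path.join(src_dir, '*' + search_str + '*')):
        shutil.move(path, os.path.join(dest_dir, os.path.basename(path)))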
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)

    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(-int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')

    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0}
            WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2})
              and cntry_cde = '{3}'
              and site_cde = '{4}'""".format(
        g['TBL_NME'],       #[0]
        g['MSMT_DTE_ID'],   #[1]
        retention_date_id,  #[2]
        g['CNTRY_CDE'],     #[3]
        g['SITE_CDE']       #[4]
    )
    dbmgr.query(q)

    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    #print(soup)

    # =============================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE DB statements
    # =============================================================================

    # COLLECT ALL JOBS RELATED LINKS
    catLinks = []
    for links in soup.find_all('a'):
        full_ref = str(links)
        link_txt = str(links.get('href'))
        if '/JOBS-IN-' in full_ref.upper() and not ('PRIORITY' in full_ref.upper()):
            catLinks.append(link_txt)
    #print(catLinks)

    # PASS 1 - INDUSTRY COUNT =====================================================
    for link in catLinks:
        facet_type = 'INDUSTRY'
        for i in range(10):
            print("iteration {0} ({1}) starting".format(i, link))
            while True:
                try:
                    time.sleep(rndm_sleep)  # INSERTS PAUSE TO ASSIST REFLECTING HUMAN INTERACTION ON WEBPAGE

                    # =============================================================
                    # PASS URL TO RETURN HTML FROM SITE PAGE
                    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
                    # =============================================================
                    url = g['URL'] + link  #.replace(href_search_str, '')
                    passedHTML = pyHTMLPass.htmlPass(url, **g)
                    soup = BeautifulSoup(passedHTML, "html.parser")
                    #print(soup)
                    title_txt = soup.title.string.upper()
                    idx = title_txt.find(' JOBS')
                    facet_desc = title_txt[:idx]
                    #print(facet_desc)
                    for span in soup.find_all('span', id='SearchSummary'):
                        for h1 in span.find_all('h1'):
                            nbr = re.search('COUNT">(.*?)</STRONG>', str(soup).upper()).group(0)
                            nbr = str(nbr).replace(',', '')
                            nbr = re.findall(r'\d+', nbr)
                            facet_count = nbr[0]
                            #print(facet_count)
                except Exception:
                    e = sys.exc_info()
                    print("iteration {0} ({1}) failed with error : {2}".format(i, facet_desc, e))
                    continue
                break

            # =================================================================
            # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
            # =================================================================
            dbmgr = pyDB(g['DB'])
            q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
                    VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                g['TBL_NME'],      #[0]
                g['MSMT_DTE_ID'],  #[1]
                g['DATA_TYPE'],    #[2]
                g['CNTRY_CDE'],    #[3]
                g['SITE_CDE'],     #[4]
                facet_type,        #[5]
                facet_desc,        #[6]
                facet_count,       #[7]
                g['STARTED_AT'],   #[8]
                ''                 #[9]
            )
            dbmgr.query(q)
            break

        # =========================================================================
        # WRITE HTML PAGE TO FILE
        # =========================================================================
        if g['WRITE_HTML_TO_FILE'] == 'Y':
            file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g['SITE_CDE'] + '_' + facet_type.replace(' ', '_') + '_' + facet_desc.replace(' ', '_') + '.html'
            with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name, 'w+', encoding='utf-8') as f:
                f.writelines(str(soup))

    # PASS 2 - TOTAL COUNT ========================================================
    time.sleep(rndm_sleep)  # INSERTS PAUSE TO ASSIST REFLECTING HUMAN INTERACTION ON WEBPAGE

    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL'] + g['URL_PART1']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    #print(soup)
    facet_type = 'TOTAL'
    facet_desc = 'ALL JOBS'
    nbr = re.search('COUNT">(.*?)</STRONG>', str(soup).upper()).group(0)
    nbr = str(nbr).replace(',', '')
    nbr = re.findall(r'\d+', nbr)
    facet_count = nbr[0]

    # =============================================================================
    # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
            VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
        g['TBL_NME'],      #[0]
        g['MSMT_DTE_ID'],  #[1]
        g['DATA_TYPE'],    #[2]
        g['CNTRY_CDE'],    #[3]
        g['SITE_CDE'],     #[4]
        facet_type,        #[5]
        facet_desc,        #[6]
        facet_count,       #[7]
        g['STARTED_AT'],   #[8]
        ''                 #[9]
    )
    dbmgr.query(q)

    # =============================================================================
    # WRITE HTML PAGE TO FILE
    # =============================================================================
    if g['WRITE_HTML_TO_FILE'] == 'Y':
        file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g['SITE_CDE'] + '_' + facet_type.replace(' ', '_').replace('/', '-') + '_' + facet_desc.replace(' ', '_').replace('/', '-') + '.html'
        with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name, 'w+', encoding='utf-8') as f:
            f.writelines(str(soup))

    # PASS 3 - REGION COUNT =======================================================
    facet_type = 'REGION'
    regions = g['REGIONS']
    regions_array = []

    # CONVERTS LIST OBJECT TO ARRAY FOR LOOPING
    for item in regions.split(','):  # COMMA, OR OTHER
        regions_array.append(item)

    # LOOP THROUGH ALL THE ITEMS IN REGIONS
    for region in regions_array:
        for i in range(10):
            print("iteration {0} ({1}) starting".format(i, region))
            while True:
                try:
                    time.sleep(rndm_sleep)  # INSERTS PAUSE TO ASSIST REFLECTING HUMAN INTERACTION ON WEBPAGE

                    # =============================================================
                    # PASS URL TO RETURN HTML FROM SITE PAGE
                    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
                    # =============================================================
                    url = g['URL'] + g['URL_PART1'] + g['URL_PART2'] + '{}'.format(region.replace(' ', '-'))
                    passedHTML = pyHTMLPass.htmlPass(url, **g)
                    soup = BeautifulSoup(passedHTML, "html.parser")
                    #print(soup)
                    facet_desc = str(region.upper())
                    nbr = re.search('COUNT">(.*?)</STRONG>', str(soup).upper()).group(0)
                    nbr = str(nbr).replace(',', '')
                    nbr = re.findall(r'\d+', nbr)
                    facet_count = nbr[0]
                except Exception:
                    e = sys.exc_info()
                    print("iteration {0} ({1}) failed with error : {2}".format(i, facet_desc, e))
                    continue
                break

            # =================================================================
            # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
            # =================================================================
            dbmgr = pyDB(g['DB'])
            q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
                    VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                g['TBL_NME'],      #[0]
                g['MSMT_DTE_ID'],  #[1]
                g['DATA_TYPE'],    #[2]
                g['CNTRY_CDE'],    #[3]
                g['SITE_CDE'],     #[4]
                facet_type,        #[5]
                facet_desc,        #[6]
                facet_count,       #[7]
                g['STARTED_AT'],   #[8]
                ''                 #[9]
            )
            dbmgr.query(q)
            break

        # =========================================================================
        # WRITE HTML PAGE TO FILE
        # =========================================================================
        if g['WRITE_HTML_TO_FILE'] == 'Y':
            file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g['SITE_CDE'] + '_' + facet_type.replace(' ', '_') + '_' + facet_desc.replace(' ', '_') + '.html'
            with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name, 'w+', encoding='utf-8') as f:
                f.writelines(str(soup))

    # =============================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # =============================================================================

    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime("%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0}
            SET finished_at = '{1}'
            WHERE cntry_cde = '{2}'
              and msmt_dte_id = {3}""".format(
        g['TBL_NME'],     #[0]
        finished_at,      #[1]
        g['CNTRY_CDE'],   #[2]
        g['MSMT_DTE_ID']  #[3]
    )
    dbmgr.query(q)
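# -----------------------------------------------------------------------------
# NOTE (illustrative sketch, not the repo's API): the PASS 1 / PASS 3 loops
# above pair "for i in range(10)" with an inner "while True", so a page that
# keeps failing is retried without bound. A bounded-retry helper like the one
# below captures the same intent with a hard stop; the names, max_tries, and
# pause values are assumptions for illustration only.
# -----------------------------------------------------------------------------
import time

def fetch_with_retries(fetch, max_tries=10, pause=5):
    # fetch is a zero-argument callable that raises on failure
    for attempt in range(1, max_tries + 1):
        try:
            return fetch()
        except Exception as e:
            print("attempt {0} failed with error : {1}".format(attempt, e))
            time.sleep(pause)
    raise RuntimeError("all {0} attempts failed".format(max_tries))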
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)

    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(-int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')

    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0}
            WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2})
              and cntry_cde = '{3}'
              and site_cde = '{4}'""".format(
        g['TBL_NME'],       #[0]
        g['MSMT_DTE_ID'],   #[1]
        retention_date_id,  #[2]
        g['CNTRY_CDE'],     #[3]
        g['SITE_CDE']       #[4]
    )
    dbmgr.query(q)

    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    #print(soup)

    # =============================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE DB statements
    # =============================================================================

    # PASS 1 - INDUSTRY DETAILS ===================================================
    for links in soup.find_all('a'):
        full_ref = str(links)
        link_txt = str(links.get('href'))
        if 'JOBS AVAILABLE IN' in full_ref.upper():
            facet_type = 'INDUSTRY'
            facet_desc = links.string.upper()
            link_nbr = re.findall(r'\d+', full_ref)
            facet_count = ''.join(str(e) for e in link_nbr)

            # =================================================================
            # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
            # =================================================================
            dbmgr = pyDB(g['DB'])
            q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
                    VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                g['TBL_NME'],      #[0]
                g['MSMT_DTE_ID'],  #[1]
                g['DATA_TYPE'],    #[2]
                g['CNTRY_CDE'],    #[3]
                g['SITE_CDE'],     #[4]
                facet_type,        #[5]
                facet_desc,        #[6]
                facet_count,       #[7]
                g['STARTED_AT'],   #[8]
                ''                 #[9]
            )
            dbmgr.query(q)

    # PASS 2 - REGIONAL DETAILS ===================================================
    facet_type = 'REGION'
    regions = g['REGIONS']
    regions_array = []

    # CONVERTS LIST OBJECT TO ARRAY FOR LOOPING
    for item in regions.split(','):  # COMMA, OR OTHER
        regions_array.append(item)

    # LOOP THROUGH ALL THE ITEMS IN REGIONS
    for region in regions_array:
        time.sleep(rndm_sleep)  # INSERTS PAUSE TO ASSIST REFLECTING HUMAN INTERACTION ON WEBPAGE

        # =========================================================================
        # PASS URL TO RETURN HTML FROM SITE PAGE
        # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
        # =========================================================================
        url = g['URL'] + g['URL_PART1'] + '{}'.format(region.replace(' ', '+'))
        passedHTML = pyHTMLPass.htmlPass(url, **g)
        soup = BeautifulSoup(passedHTML, "html.parser")
        #print(soup)
        soup = soup.encode("utf-8")  # CODE PAGE ERROR - CONVERTS
        soup = str(soup)

        # =========================================================================
        # SCRAPE SUB PART - START
        # =========================================================================
        facet_desc = str(region.upper())
        facet_count = re.search('1-10 of(.*?)</p>', soup).group(1)
        facet_count = facet_count.replace(',', '').strip()

        # =========================================================================
        # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
        # =========================================================================
        dbmgr = pyDB(g['DB'])
        q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
                VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
            g['TBL_NME'],      #[0]
            g['MSMT_DTE_ID'],  #[1]
            g['DATA_TYPE'],    #[2]
            g['CNTRY_CDE'],    #[3]
            g['SITE_CDE'],     #[4]
            facet_type,        #[5]
            facet_desc,        #[6]
            facet_count,       #[7]
            g['STARTED_AT'],   #[8]
            ''                 #[9]
        )
        dbmgr.query(q)
        # =========================================================================
        # SCRAPE SUB PART - END
        # =========================================================================

    # =============================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # =============================================================================

    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime("%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0}
            SET finished_at = '{1}'
            WHERE cntry_cde = '{2}'
              and msmt_dte_id = {3}""".format(
        g['TBL_NME'],     #[0]
        finished_at,      #[1]
        g['CNTRY_CDE'],   #[2]
        g['MSMT_DTE_ID']  #[3]
    )
    dbmgr.query(q)
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)

    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(-int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')

    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0}
            WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2})
              and cntry_cde = '{3}'
              and site_cde = '{4}'""".format(
        g['TBL_NME'],       #[0]
        g['MSMT_DTE_ID'],   #[1]
        retention_date_id,  #[2]
        g['CNTRY_CDE'],     #[3]
        g['SITE_CDE']       #[4]
    )
    dbmgr.query(q)

    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL'] + '/browse'
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    #print(soup)

    # =============================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE DB statements
    # =============================================================================

    # PASS 1 - INDUSTRY DETAILS ===================================================
    rndm_sleep = random.randint(rLow, rHigh)
    facet_type = 'INDUSTRY'
    link_txt_array = []
    for href in soup.find_all('a'):
        if '/BROWSE/' in str(href).upper() and '-JOBS' in str(href).upper():  #and 'LINK-DEFAULT' not in str(href).upper():
            full_ref = str(href)
            link_txt = str(href.get('href'))
            if link_txt.count('/') < 5:
                link_txt_array.append(link_txt.replace('/browse', ''))
    #print(link_txt_array)

    for link_txt in link_txt_array:
        # =============================================================================
        # PASS URL TO RETURN HTML FROM SITE PAGE
        # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
        # =============================================================================
        url = link_txt
        passedHTML = pyHTMLPass.htmlPass(url, **g)
        soup = BeautifulSoup(passedHTML, "html.parser")
        #print(soup)

        for h1 in soup.find_all('h1'):
            facet_desc = str(h1.text).upper().replace('BROWSE', '').replace('IN AUSTRALIA', '').strip()

        for span in soup.find_all('span', class_='c'):
            facet_count = str(span.text)
            facet_count = int(facet_count.replace(',', ''))

            # =============================================================================
            # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
            # =============================================================================
            dbmgr = pyDB(g['DB'])
            q = r"""INSERT INTO {0}
                    (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE,
                     FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
                    VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                g['TBL_NME'],       #[0]
                g['MSMT_DTE_ID'],   #[1]
                g['DATA_TYPE'],     #[2]
                g['CNTRY_CDE'],     #[3]
                g['SITE_CDE'],      #[4]
                facet_type,         #[5]
                facet_desc,         #[6]
                facet_count,        #[7]
                g['STARTED_AT'],    #[8]
                ''                  #[9]
            )
            dbmgr.query(q)

        # =============================================================================
        # WRITE HTML PAGE TO FILE
        # =============================================================================
        if g['WRITE_HTML_TO_FILE'] == 'Y':
            file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g['SITE_CDE'] + '_' + \
                facet_type.replace(' ', '_').replace('/', '-') + '_' + \
                facet_desc.replace(' ', '_').replace('/', '-') + '.html'
            with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name, 'w+', encoding='utf-8') as f:
                f.writelines(str(soup))

    # PASS 2 - REGIONAL DETAILS ===================================================
    facet_type = 'REGION'
    regions = g['REGIONS']
    regions_array = []

    # CONVERTS LIST OBJECT TO ARRAY FOR LOOPING
    for item in regions.split(','):  # COMMA, OR OTHER
        regions_array.append(item)

    # LOOP THROUGH ALL THE ITEMS IN REGIONS
    for region in regions_array:
        time.sleep(rndm_sleep)  # INSERTS PAUSE TO ASSIST REFLECTING HUMAN INTERACTION ON WEBPAGE

        # =============================================================================
        # PASS URL TO RETURN HTML FROM SITE PAGE
        # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
        # =============================================================================
        url = g['URL'] + g['URL_PART1'] + '{}'.format(region.replace(' ', '+'))
        passedHTML = pyHTMLPass.htmlPass(url, **g)
        soup = BeautifulSoup(passedHTML, "html.parser")
        #print(soup)
        soup = soup.encode("utf-8")  # CODE PAGE ERROR - CONVERTS
        soup = str(soup)

        # =============================================================================
        # SCRAPE SUB PART - START
        # =============================================================================
        facet_desc = str(region.upper())
        facet_count = re.search('1-10 of(.*?)</p>', soup).group(1)
        facet_count = facet_count.replace(',', '').strip()

        # =============================================================================
        # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
        # =============================================================================
        dbmgr = pyDB(g['DB'])
        q = r"""INSERT INTO {0}
                (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE,
                 FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
                VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
            g['TBL_NME'],       #[0]
            g['MSMT_DTE_ID'],   #[1]
            g['DATA_TYPE'],     #[2]
            g['CNTRY_CDE'],     #[3]
            g['SITE_CDE'],      #[4]
            facet_type,         #[5]
            facet_desc,         #[6]
            facet_count,        #[7]
            g['STARTED_AT'],    #[8]
            ''                  #[9]
        )
        dbmgr.query(q)

        # =============================================================================
        # WRITE HTML PAGE TO FILE
        # =============================================================================
        if g['WRITE_HTML_TO_FILE'] == 'Y':
            file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g['SITE_CDE'] + '_' + \
                facet_type.replace(' ', '_').replace('/', '-') + '_' + \
                facet_desc.replace(' ', '_').replace('/', '-') + '.html'
            with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name, 'w+', encoding='utf-8') as f:
                f.writelines(str(passedHTML))

    # =============================================================================
    # SCRAPE SUB PART - END
    # =============================================================================

    # =============================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # =============================================================================

    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime("%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0}
            SET finished_at = '{1}'
            WHERE cntry_cde = '{2}'
              and msmt_dte_id = {3}""".format(
        g['TBL_NME'],      #[0]
        finished_at,       #[1]
        g['CNTRY_CDE'],    #[2]
        g['MSMT_DTE_ID']   #[3]
    )
    dbmgr.query(q)
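
# =============================================================================
# THE DELETE/INSERT STATEMENTS ABOVE INTERPOLATE VALUES WITH str.format().
# A MINIMAL SKETCH OF THE SAME DELETE USING BOUND PARAMETERS, ASSUMING THE
# LOCAL DB IS SQLITE (AN ASSUMPTION - pyDB'S INTERNALS ARE NOT SHOWN HERE).
# TABLE NAMES CANNOT BE BOUND, SO ONLY THE NAME IS STILL FORMATTED IN.
# =============================================================================
import sqlite3

def delete_rerun_rows(db_path, tbl_nme, msmt_dte_id, retention_date_id, cntry_cde, site_cde):
    q = ("DELETE FROM {0} WHERE (msmt_dte_id = ? OR msmt_dte_id <= ?) "
         "AND cntry_cde = ? AND site_cde = ?").format(tbl_nme)
    with sqlite3.connect(db_path) as conn:
        # VALUES ARE PASSED AS A TUPLE AND ESCAPED BY THE DRIVER
        conn.execute(q, (msmt_dte_id, retention_date_id, cntry_cde, site_cde))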
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)

    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(-int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')

    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0}
            WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2})
              and cntry_cde = '{3}'
              and site_cde = '{4}'""".format(
        g['TBL_NME'],       #[0]
        g['MSMT_DTE_ID'],   #[1]
        retention_date_id,  #[2]
        g['CNTRY_CDE'],     #[3]
        g['SITE_CDE']       #[4]
    )
    dbmgr.query(q)

    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    #print(soup)

    # =============================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE DB statements
    # =============================================================================

    # PASS 1 - INDUSTRY AND REGION COUNTS =========================================
    for div in soup.find_all('div', class_='srp-filter-panel--inner'):
        for links in soup.find_all('a'):
            full_ref = str(links)
            link_txt = str(links.get('href'))
            match_pattern = re.search(r'\((.*?)\)', full_ref)
            if ('SRP-LIST-FILTER__ITEM' in str(links).upper()
                    and match_pattern is not None
                    and '/S-JOBS/C9302' not in str(links).upper()):  #and 'AD=OFFERING' in str(links).upper()
                if ('/S-JOBS/ACT' in str(links).upper() or '/S-JOBS/NSW' in str(links).upper()
                        or '/S-JOBS/NT' in str(links).upper() or '/S-JOBS/QLD' in str(links).upper()
                        or '/S-JOBS/SA' in str(links).upper() or '/S-JOBS/TAS' in str(links).upper()
                        or '/S-JOBS/VIC' in str(links).upper() or '/S-JOBS/WA' in str(links).upper()):
                    facet_type = 'REGION'
                elif '/S-JOBS/JOBTYPE' in str(links).upper():
                    facet_type = 'JOBTYPE'
                elif '/S-JOBS/ADVERTISEDBY' in str(links).upper():
                    facet_type = 'ADVERTISED'
                else:
                    facet_type = 'INDUSTRY'

                #<a class="srp-list-filter__item-link link link--no-underline" href="/s-trades-services/c22340?ad=offering">Trades & Services (7,955)</a>
                try:
                    objText = re.search(r'">(.*?)</a>', str(full_ref)).group(1)
                    facet_desc = objText.upper().replace('&AMP;', '&')  # DECODE HTML-ENCODED AMPERSANDS
                    facet_desc = re.sub(r'\((.*?)\)', '', facet_desc)
                    facet_desc = re.sub(r'[^A-Za-z0-9& ]', '', facet_desc)
                    facet_desc = facet_desc.strip()
                    facet_count = re.search(r'\((.*?)\)', str(links)).group(1)
                    facet_count = int(facet_count.replace(',', ''))

                    # =============================================================================
                    # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                    # =============================================================================
                    dbmgr = pyDB(g['DB'])
                    q = r"""INSERT INTO {0}
                            (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE,
                             FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
                            VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                        g['TBL_NME'],       #[0]
                        g['MSMT_DTE_ID'],   #[1]
                        g['DATA_TYPE'],     #[2]
                        g['CNTRY_CDE'],     #[3]
                        g['SITE_CDE'],      #[4]
                        facet_type,         #[5]
                        facet_desc,         #[6]
                        facet_count,        #[7]
                        g['STARTED_AT'],    #[8]
                        ''                  #[9]
                    )
                    #print(q)
                    dbmgr.query(q)
                except ValueError:
                    pass  # it was a string, not an int.

    # =============================================================================
    # WRITE HTML PAGE TO FILE
    # =============================================================================
    if g['WRITE_HTML_TO_FILE'] == 'Y':
        file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g['SITE_CDE'] + '_' + 'SITE_LISTING' + '.html'
        with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name, 'w+', encoding='utf-8') as f:
            f.writelines(str(soup))

    # =============================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # =============================================================================

    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime("%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0}
            SET finished_at = '{1}'
            WHERE cntry_cde = '{2}'
              and msmt_dte_id = {3}""".format(
        g['TBL_NME'],      #[0]
        finished_at,       #[1]
        g['CNTRY_CDE'],    #[2]
        g['MSMT_DTE_ID']   #[3]
    )
    dbmgr.query(q)
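
# =============================================================================
# STANDALONE WALK-THROUGH OF THE FACET DESC/COUNT PARSING ABOVE, RUN AGAINST
# THE SAMPLE ANCHOR QUOTED IN THE CODE COMMENT (VALUES ILLUSTRATIVE ONLY).
# =============================================================================
import re

link = ('<a class="srp-list-filter__item-link" href="/s-trades-services/c22340?ad=offering">'
        'Trades &amp; Services (7,955)</a>')
desc = re.search(r'">(.*?)</a>', link).group(1).upper().replace('&AMP;', '&')
desc = re.sub(r'\((.*?)\)', '', desc)
desc = re.sub(r'[^A-Za-z0-9& ]', '', desc).strip()
count = int(re.search(r'\((.*?)\)', link).group(1).replace(',', ''))
assert (desc, count) == ('TRADES & SERVICES', 7955)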
def scrape():
    # =============================================================================
    # DELETE ANY COMPRESSED OR CSV FILES FOUND IN DATA DIRECTORY READY FOR RETRIEVAL
    # =============================================================================

    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL'] + g['URL_PART1']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    #print(soup)

    # =============================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE DB statements
    # =============================================================================

    # COLLECTS LIST OF TARGET LINKS (SUB PAGES) OF CCY PAIRS
    link_list = []
    for table in soup.find_all('table'):
        for links in table.find_all('a'):
            link = str(links.get('href'))
            ccy_pair = link[-6:]  # GET THE CCY PAIR FROM THE LINK TO COMPARE TO ACCEPTED LIST
            if ccy_pair in str(g['CCY_PAIRS']):  # 14 PAIRS
                link_list.append(link)

    # LOOP THROUGH TARGET LINKS TO DETERMINE YEAR LINKS
    for link in link_list:
        # UPDATE g DICTIONARY SO WEBPAGE WILL NOT USE SELENIUM DRIVER
        g['USES_WEB_DRVR'] = 'N'

        # =============================================================================
        # PASS URL TO RETURN HTML FROM SITE PAGE
        # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
        # =============================================================================
        url = g['URL'] + link
        passedHTML = pyHTMLPass.htmlPass(url, **g)
        soup = BeautifulSoup(passedHTML, "html.parser")
        #print(soup)

        # RETURN THE MOST CURRENT/VALID MONTH & YEAR
        dte_list = []
        for div in soup.find_all('div', class_='page-content'):
            # FIRST LOOP - GET THE RELEVANT MONTH
            # (PART OF CURRENT MONTH IS POSTED - WE ONLY WANT THE FULL "PREVIOUS" MONTH)
            for links in div.find_all('a'):
                if 'METASTOCK' in str(links).upper() and 'FULL MONTH DATA' in str(links).upper():
                    link = str(links.get('href'))
                    # GET THE YEAR AND MONTH PARTS (MAY INCLUDE AN EXTRA / DEPENDING ON
                    # LENGTH, E.G. MONTHS WITH A SINGLE DIGIT)
                    yr_mth = link[-7:]
                    if yr_mth[:1] == r'/':
                        month = yr_mth[-1:]  # GET THE MONTH NBR FROM THE LINK TO COMPARE TO ACCEPTED LIST
                        month = '0' + str(month)
                        year = yr_mth[:5]    # GET THE YEAR FROM THE LINK TO COMPARE TO ACCEPTED LIST
                        year = str(year.replace(r'/', ''))
                    else:
                        month = yr_mth[-2:]  # GET THE MONTH NBR FROM THE LINK TO COMPARE TO ACCEPTED LIST
                        year = yr_mth[:4]    # GET THE YEAR FROM THE LINK TO COMPARE TO ACCEPTED LIST
                        year = str(year.replace(r'/', ''))
                    dte = year + month
                    dte_list.append(int(dte))

            # RETURN PREVIOUS MONTH NBR (CURRENT MONTH WILL ONLY BE A PART MONTH FILE -
            # WE WANT THE LAST FULL MONTH)
            intDte = max(dte_list) - 1
            full_yr_flag = 'N'

            # NEED TO CHECK IF WE NEED FULL YEAR OR MONTH FILE
            if str(intDte)[-2:] == '00':
                # WE ARE IN JAN - NEED TO GET FULL YEAR RESULTS
                full_yr_flag = 'Y'
                yr = int(str(intDte)[:4]) - 1
                yr = str(yr)
            else:
                full_yr_flag = 'N'
                mth = str(intDte)[-2:]
                month_long_text = list(g['MONTH_NBR_CNVRT'].keys())[list(g['MONTH_NBR_CNVRT'].values()).index(mth)]
                yr = str(intDte)[:4]

            # SECOND LOOP - GETS THE RELEVANT MONTH/YEAR LINK
            for links in div.find_all('a'):
                curr_link = str(links.get('href'))
                if full_yr_flag == 'N':
                    if month_long_text in str(links).upper() and yr in str(links).upper():
                        link = curr_link
                        searchStr = str(yr) + str(mth)
                        break
                if full_yr_flag == 'Y':
                    if yr in str(links).upper():
                        link = curr_link
                        searchStr = str(yr)
                        break

        # CREATE A SEARCH STRING
        # USED TO SEARCH FOR THAT FILE AND IF DOWNLOAD HAS COMPLETED
        fileSearchStr = str(searchStr) + '.zip'

        # UPDATE g DICTIONARY SO WEBPAGE WILL USE SELENIUM DRIVER
        g['USES_WEB_DRVR'] = 'Y'

        # =============================================================================
        # PASS URL TO RETURN HTML FROM SITE PAGE
        # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
        # =============================================================================
        url = g['URL'] + link
        linkId = g['DOWNLOAD_ID']
        dlLink = pyHTMLPass.htmlDownloadLink(url, fileSearchStr, linkId, **g)
        #print(url)
        #print(linkId)

        # =============================================================================
        # MOVE DOWNLOADED FILES
        # CHROME WEBDRIVER DOESN'T "CURRENTLY" ALLOW TO SET DOWNLOAD PATH
        # FILES ARE AUTO DOWNLOADED TO THE SYSTEM DOWNLOAD DIRECTORY
        # =============================================================================
        pyLIB.moveFiles(g['DEFAULT_SYS_DOWNLOAD_PATH'], g['FILE_MOVE_DEST_PATH'], fileSearchStr)
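
# =============================================================================
# STANDALONE CHECK OF THE PREVIOUS-MONTH DERIVATION ABOVE (HYPOTHETICAL VALUES):
# max(dte_list) IS THE NEWEST YYYYMM ON THE PAGE; SUBTRACTING 1 ROLLS 201701
# BACK TO 201700, WHOSE "00" MONTH SIGNALS THE FULL PRIOR-YEAR FILE IS WANTED.
# =============================================================================
dte_list = [201611, 201612, 201701]
intDte = max(dte_list) - 1            # 201700
if str(intDte)[-2:] == '00':          # JANUARY EDGE CASE -> TAKE THE FULL-YEAR FILE
    yr = str(int(str(intDte)[:4]) - 1)
    assert yr == '2016'
else:
    mth = str(intDte)[-2:]            # OTHERWISE USE THE MONTH FILE FOR YYYY/MM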
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)

    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(-int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')

    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0}
            WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2})
              and cntry_cde = '{3}'
              and site_cde = '{4}'""".format(
        g['TBL_NME'],       #[0]
        g['MSMT_DTE_ID'],   #[1]
        retention_date_id,  #[2]
        g['CNTRY_CDE'],     #[3]
        g['SITE_CDE']       #[4]
    )
    dbmgr.query(q)

    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    #soup = soup.encode("utf-8","ignore").decode('ascii', 'ignore')
    #print(soup)

    # =============================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE DB statements
    # =============================================================================

    # COLLECT ALL JOBS RELATED LINKS
    regionLinksList = []
    industryLinksList = []
    jobtypeLinksList = []

    # PASS 1 - COLLECT LINKS FOR THE VARIOUS TYPES ================================
    for ul in soup.find_all('ul', class_='provinceList'):
        for links in ul.find_all('a'):
            link = str(links.get('href'))
            regionLinksList.append(link)
    #print(regionLinksList)

    for ul in soup.find_all('ul', class_='categoryList'):
        for links in ul.find_all('a'):
            link = str(links.get('href'))
            industryLinksList.append(link)
    #print(industryLinksList)

    #for ul in soup.find_all('ul', class_='studentsList'):
    #    for links in ul.find_all('a'):
    #        link = str(links.get('href'))
    #        jobtypeLinksList.append(link)

    # PASS 2 - COLLECT REGION DATA ================================================
    facet_type = 'REGION'
    for link in regionLinksList:
        time.sleep(rndm_sleep)  # INSERTS PAUSE TO ASSIST REFLECTING HUMAN INTERACTION ON WEBPAGE

        # =============================================================================
        # PASS URL TO RETURN HTML FROM SITE PAGE
        # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
        # =============================================================================
        url = g['URL'].lower().replace('/jobsearch/browse-jobs/', link)
        #url = link #.replace(href_search_str, '')
        passedHTML = pyHTMLPass.htmlPass(url, **g)
        soup = BeautifulSoup(passedHTML, "html.parser")
        #print(soup)

        for h1 in soup.find_all('h1'):
            for links in h1.find_all('a'):
                facet_desc = links.text.upper().replace('JOBS', '').strip()
                link = str(links.get('href'))

                # =============================================================================
                # PASS URL TO RETURN HTML FROM SITE PAGE
                # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
                # =============================================================================
                url = g['URL'].lower().replace('/jobsearch/browse-jobs/', link)
                #url = link #.replace(href_search_str, '')
                passedHTML = pyHTMLPass.htmlPass(url, **g)
                soup = BeautifulSoup(passedHTML, "html.parser")

                for div in soup.find_all('div', class_='ResultText'):  #result-count
                    for span in div.find_all('span', class_='ResultText-numTotal'):
                        facet_count = span.text  #re.search(r' of(.*?)</strong>',str(strong)).group(1)
                        #facet_count = strong.text.upper()
                        #facet_count = facet_count.split('OF',1)[1]
                        facet_count = facet_count.strip()

                        # =============================================================================
                        # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                        # =============================================================================
                        dbmgr = pyDB(g['DB'])
                        q = r"""INSERT INTO {0}
                                (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE,
                                 FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
                                VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                            g['TBL_NME'],       #[0]
                            g['MSMT_DTE_ID'],   #[1]
                            g['DATA_TYPE'],     #[2]
                            g['CNTRY_CDE'],     #[3]
                            g['SITE_CDE'],      #[4]
                            facet_type,         #[5]
                            facet_desc,         #[6]
                            facet_count,        #[7]
                            g['STARTED_AT'],    #[8]
                            ''                  #[9]
                        )
                        dbmgr.query(q)

                        # =============================================================================
                        # WRITE HTML PAGE TO FILE
                        # =============================================================================
                        if g['WRITE_HTML_TO_FILE'] == 'Y':
                            file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g['SITE_CDE'] + '_' + \
                                facet_type.replace(' ', '_') + '_' + facet_desc.replace(' ', '_') + '.html'
                            with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name, 'w+', encoding='utf-8') as f:
                                f.writelines(str(soup))

    # PASS 3 - COLLECT INDUSTRY DATA ==============================================
    facet_type = 'INDUSTRY'
    for link in industryLinksList:
        time.sleep(rndm_sleep)  # INSERTS PAUSE TO ASSIST REFLECTING HUMAN INTERACTION ON WEBPAGE

        # =============================================================================
        # PASS URL TO RETURN HTML FROM SITE PAGE
        # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
        # =============================================================================
        url = g['URL'].lower().replace('/jobsearch/browse-jobs/', link)
        #url = link #.replace(href_search_str, '')
        passedHTML = pyHTMLPass.htmlPass(url, **g)
        soup = BeautifulSoup(passedHTML, "html.parser")
        #print(soup)

        for h1 in soup.find_all('h1'):
            for links in h1.find_all('a'):
                facet_desc = links.text.upper().replace('JOBS', '').strip()
                link = str(links.get('href'))

                # =============================================================================
                # PASS URL TO RETURN HTML FROM SITE PAGE
                # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
                # =============================================================================
                url = g['URL'].lower().replace('/jobsearch/browse-jobs/', link)
                #url = link #.replace(href_search_str, '')
                passedHTML = pyHTMLPass.htmlPass(url, **g)
                soup = BeautifulSoup(passedHTML, "html.parser")

                for div in soup.find_all('div', class_='ResultText'):
                    for span in div.find_all('span', class_='ResultText-numTotal'):
                        facet_count = span.text  #re.search(r' of(.*?)</strong>',str(strong)).group(1)
                        facet_count = facet_count.strip()

                        # =============================================================================
                        # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                        # =============================================================================
                        dbmgr = pyDB(g['DB'])
                        q = r"""INSERT INTO {0}
                                (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE,
                                 FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
                                VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                            g['TBL_NME'],       #[0]
                            g['MSMT_DTE_ID'],   #[1]
                            g['DATA_TYPE'],     #[2]
                            g['CNTRY_CDE'],     #[3]
                            g['SITE_CDE'],      #[4]
                            facet_type,         #[5]
                            facet_desc,         #[6]
                            facet_count,        #[7]
                            g['STARTED_AT'],    #[8]
                            ''                  #[9]
                        )
                        dbmgr.query(q)

                        # =============================================================================
                        # WRITE HTML PAGE TO FILE
                        # =============================================================================
                        if g['WRITE_HTML_TO_FILE'] == 'Y':
                            file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g['SITE_CDE'] + '_' + \
                                facet_type.replace(' ', '_') + '_' + facet_desc.replace(' ', '_') + '.html'
                            with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name, 'w+', encoding='utf-8') as f:
                                f.writelines(str(soup))

    # PASS 4 - COLLECT JOBTYPE DATA ===============================================
    facet_type = 'JOB TYPE'
    for link in jobtypeLinksList:
        time.sleep(rndm_sleep)  # INSERTS PAUSE TO ASSIST REFLECTING HUMAN INTERACTION ON WEBPAGE

        # =============================================================================
        # PASS URL TO RETURN HTML FROM SITE PAGE
        # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
        # =============================================================================
        url = g['URL'].lower().replace('/jobsearch/browse-jobs/', link)
        #url = link #.replace(href_search_str, '')
        passedHTML = pyHTMLPass.htmlPass(url, **g)
        soup = BeautifulSoup(passedHTML, "html.parser")
        #print(soup)

        for h1 in soup.find_all('h1', class_='sr-search-title'):
            facet_desc = h1.text.upper().replace('JOBS', '').strip()
            #print(facet_desc)

        for div in soup.find_all('div', class_='result-count'):
            for p in div.find_all('p'):
                facet_count = p.text.upper()
                facet_count = facet_count.split('OF', 1)[1]
                facet_count = facet_count.strip()

                # =============================================================================
                # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                # =============================================================================
                dbmgr = pyDB(g['DB'])
                q = r"""INSERT INTO {0}
                        (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE,
                         FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
                        VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                    g['TBL_NME'],       #[0]
                    g['MSMT_DTE_ID'],   #[1]
                    g['DATA_TYPE'],     #[2]
                    g['CNTRY_CDE'],     #[3]
                    g['SITE_CDE'],      #[4]
                    facet_type,         #[5]
                    facet_desc,         #[6]
                    facet_count,        #[7]
                    g['STARTED_AT'],    #[8]
                    ''                  #[9]
                )
                dbmgr.query(q)

                # =============================================================================
                # WRITE HTML PAGE TO FILE
                # =============================================================================
                if g['WRITE_HTML_TO_FILE'] == 'Y':
                    file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g['SITE_CDE'] + '_' + \
                        facet_type.replace(' ', '_') + '_' + facet_desc.replace(' ', '_') + '.html'
                    with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name, 'w+', encoding='utf-8') as f:
                        f.writelines(str(soup))

    # =============================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # =============================================================================

    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime("%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0}
            SET finished_at = '{1}'
            WHERE cntry_cde = '{2}'
              and msmt_dte_id = {3}""".format(
        g['TBL_NME'],      #[0]
        finished_at,       #[1]
        g['CNTRY_CDE'],    #[2]
        g['MSMT_DTE_ID']   #[3]
    )
    dbmgr.query(q)
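
# =============================================================================
# MINIMAL STANDALONE CHECK OF THE ResultText COUNT EXTRACTION USED ABOVE.
# THE MARKUP BELOW IS A MADE-UP SAMPLE; ONLY BeautifulSoup IS REQUIRED.
# =============================================================================
from bs4 import BeautifulSoup

html = '<div class="ResultText"><span class="ResultText-numTotal">12,345</span></div>'
soup = BeautifulSoup(html, 'html.parser')
for div in soup.find_all('div', class_='ResultText'):
    for span in div.find_all('span', class_='ResultText-numTotal'):
        print(span.text.strip())  # PRINTS: 12,345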
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)

    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(-int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')

    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0}
            WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2})
              and cntry_cde = '{3}'
              and site_cde = '{4}'""".format(
        g['TBL_NME'],       #[0]
        g['MSMT_DTE_ID'],   #[1]
        retention_date_id,  #[2]
        g['CNTRY_CDE'],     #[3]
        g['SITE_CDE']       #[4]
    )
    dbmgr.query(q)

    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    #print(soup)

    # =============================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE DB statements
    # =============================================================================

    # PASS 1 - INDUSTRY COUNTS ====================================================
    for links in soup.find_all('a'):
        full_ref = str(links)
        link_txt = str(links.get('href'))
        facet_type = 'INDUSTRY'
        facet_desc_ = links.string
        facet_desc = str(facet_desc_).upper()

        if '/JOBS/' in link_txt.upper() and link_txt.count('/') == 3:
            for i in range(10):
                print("iteration {0} ({1}) starting".format(i, facet_desc))
                while True:
                    try:
                        time.sleep(rndm_sleep)  # INSERTS PAUSE TO ASSIST REFLECTING HUMAN INTERACTION ON WEBPAGE

                        # =============================================================================
                        # PASS URL TO RETURN HTML FROM SITE PAGE
                        # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
                        # =============================================================================
                        url = g['URL'] + link_txt.upper().replace('/JOBS/', '')
                        passedHTML = pyHTMLPass.htmlPass(url, **g)
                        soup = BeautifulSoup(passedHTML, "html.parser")
                        #print(soup)
                        soup = soup.encode("utf-8")  # CODE PAGE ERROR - CONVERTS
                        soup = str(soup)

                        facet_count = re.search(r'View all(.*?)jobs', soup).group(1)
                        facet_count = facet_count.replace(',', '')
                        try:
                            facet_count = int(facet_count)
                            facet_count = str(facet_count)
                        except:
                            facet_count = '0'  # NON-NUMERIC COUNT - DEFAULT TO ZERO
                    except:
                        e = sys.exc_info()
                        print("iteration {0} ({1}) failed with error : {2}".format(i, facet_desc, e))
                        continue
                    break

                # =============================================================================
                # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                # =============================================================================
                dbmgr = pyDB(g['DB'])
                q = r"""INSERT INTO {0}
                        (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE,
                         FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
                        VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                    g['TBL_NME'],       #[0]
                    g['MSMT_DTE_ID'],   #[1]
                    g['DATA_TYPE'],     #[2]
                    g['CNTRY_CDE'],     #[3]
                    g['SITE_CDE'],      #[4]
                    facet_type,         #[5]
                    facet_desc,         #[6]
                    facet_count,        #[7]
                    g['STARTED_AT'],    #[8]
                    ''                  #[9]
                )
                dbmgr.query(q)
                break

            # =============================================================================
            # WRITE HTML PAGE TO FILE
            # =============================================================================
            if g['WRITE_HTML_TO_FILE'] == 'Y':
                file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g['SITE_CDE'] + '_' + \
                    facet_type.replace(' ', '_').replace('/', '-') + '_' + \
                    facet_desc.replace(' ', '_').replace('/', '-') + '.html'
                with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name, 'w+', encoding='utf-8') as f:
                    f.writelines(str(soup))

    # =============================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # =============================================================================

    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime("%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0}
            SET finished_at = '{1}'
            WHERE cntry_cde = '{2}'
              and msmt_dte_id = {3}""".format(
        g['TBL_NME'],      #[0]
        finished_at,       #[1]
        g['CNTRY_CDE'],    #[2]
        g['MSMT_DTE_ID']   #[3]
    )
    dbmgr.query(q)
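
# =============================================================================
# THE for/while/try PATTERN ABOVE RETRIES FOREVER ON REPEATED FAILURES (THE
# while True LOOP ONLY EXITS ON SUCCESS). A SMALL HYPOTHETICAL HELPER SHOWING
# THE SAME IDEA WITH A BOUNDED NUMBER OF ATTEMPTS; fetch IS ANY ZERO-ARGUMENT
# CALLABLE SUPPLIED BY THE CALLER.
# =============================================================================
import sys
import time

def fetch_with_retry(fetch, attempts=10, pause=1):
    for i in range(attempts):
        try:
            time.sleep(pause)       # KEEP THE "HUMAN" PAUSE BETWEEN TRIES
            return fetch()          # SUCCESS - RETURN THE RESULT IMMEDIATELY
        except Exception:
            print("iteration {0} failed with error : {1}".format(i, sys.exc_info()))
    return None                     # GIVE UP AFTER `attempts` FAILURES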
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)

    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(-int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')

    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0}
            WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2})
              and cntry_cde = '{3}'
              and site_cde = '{4}'""".format(
        g['TBL_NME'],       #[0]
        g['MSMT_DTE_ID'],   #[1]
        retention_date_id,  #[2]
        g['CNTRY_CDE'],     #[3]
        g['SITE_CDE']       #[4]
    )
    dbmgr.query(q)

    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    #soup = soup.encode("utf-8","ignore").decode('ascii', 'ignore')
    #print(soup.encode("utf-8","ignore").decode('ascii', 'ignore'))

    # =============================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE DB statements
    # =============================================================================

    # PASS 1 - INDUSTRY COUNT =====================================================
    for div in soup.find_all('div', class_='jsCustomScrollContent'):

        # =============================================================================
        # JOBTYPE CLASS ONLY
        # =============================================================================
        for div_child in div.find_all('div', id='JobType'):
            # RETURN THE SECTION HEADER (FACET TYPE) FOR EACH OF THE CHILD ELEMENTS
            for span in div_child.find_all('span'):
                facet_type = span.text.upper()

            for div_data in div_child.find_all('div', class_='refineitem'):
                facet = div_data.find_all('label')
                # FACET DESC (TYPE DESC)
                facet_desc = re.search('>(.*)</label', str(facet[0]))
                facet_desc = str(facet_desc.group(1)).upper()
                # FACET COUNT
                facet_count = re.search('>(.*)</label', str(facet[1]))
                if facet_count.group(1):
                    facet_count = facet_count.group(1)
                else:
                    facet_count = '0'

                # REMOVE THE CATCH-ALL IN THE LIST FROM THE INSERT STATEMENT
                if facet_desc != 'ANY':
                    # =============================================================================
                    # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                    # =============================================================================
                    dbmgr = pyDB(g['DB'])
                    q = r"""INSERT INTO {0}
                            (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE,
                             FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
                            VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                        g['TBL_NME'],       #[0]
                        g['MSMT_DTE_ID'],   #[1]
                        g['DATA_TYPE'],     #[2]
                        g['CNTRY_CDE'],     #[3]
                        g['SITE_CDE'],      #[4]
                        facet_type,         #[5]
                        facet_desc,         #[6]
                        facet_count,        #[7]
                        g['STARTED_AT'],    #[8]
                        ''                  #[9]
                    )
                    dbmgr.query(q)

        # =============================================================================
        # SALARY CLASS ONLY
        # =============================================================================
        for div_child in div.find_all('div', id='Salary'):
            # RETURN THE SECTION HEADER (FACET TYPE) FOR EACH OF THE CHILD ELEMENTS
            for span in div_child.find_all('span'):
                facet_type = span.text.upper()

            for div_data in div_child.find_all('div', class_='refineitem'):
                facet = div_data.find_all('label')
                # FACET DESC (TYPE DESC)
                facet_desc = re.search('>(.*)</label', str(facet[0]))
                facet_desc = str(facet_desc.group(1)).upper()
                # FACET COUNT
                facet_count = re.search('>(.*)</label', str(facet[1]))
                if facet_count.group(1):
                    facet_count = facet_count.group(1)
                else:
                    facet_count = '0'

                # =============================================================================
                # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                # =============================================================================
                dbmgr = pyDB(g['DB'])
                q = r"""INSERT INTO {0}
                        (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE,
                         FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
                        VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                    g['TBL_NME'],       #[0]
                    g['MSMT_DTE_ID'],   #[1]
                    g['DATA_TYPE'],     #[2]
                    g['CNTRY_CDE'],     #[3]
                    g['SITE_CDE'],      #[4]
                    facet_type,         #[5]
                    facet_desc,         #[6]
                    facet_count,        #[7]
                    g['STARTED_AT'],    #[8]
                    ''                  #[9]
                )
                dbmgr.query(q)

        # =============================================================================
        # MARKETS CLASS ONLY
        # =============================================================================
        for div_child in div.find_all('div', id='Markets'):
            # RETURN THE SECTION HEADER (FACET TYPE) FOR EACH OF THE CHILD ELEMENTS
            for span in div_child.find_all('span'):
                facet_type = span.text.upper()

            for div_data in div_child.find_all('div', class_='refineitem'):
                facet = div_data.find_all('label')
                # FACET DESC (TYPE DESC)
                facet_desc = re.search('>(.*)</label', str(facet[0]))
                facet_desc = str(facet_desc.group(1)).upper()
                facet_desc = facet_desc.replace('&AMP;', '&')  # DECODE HTML-ENCODED AMPERSANDS
                # FACET COUNT
                facet_count = re.search('>(.*)</label', str(facet[1]))
                if facet_count.group(1):
                    facet_count = facet_count.group(1)
                else:
                    facet_count = '0'

                # =============================================================================
                # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                # =============================================================================
                dbmgr = pyDB(g['DB'])
                q = r"""INSERT INTO {0}
                        (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE,
                         FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
                        VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                    g['TBL_NME'],       #[0]
                    g['MSMT_DTE_ID'],   #[1]
                    g['DATA_TYPE'],     #[2]
                    g['CNTRY_CDE'],     #[3]
                    g['SITE_CDE'],      #[4]
                    facet_type,         #[5]
                    facet_desc,         #[6]
                    facet_count,        #[7]
                    g['STARTED_AT'],    #[8]
                    ''                  #[9]
                )
                dbmgr.query(q)

    # =============================================================================
    # WRITE HTML PAGE TO FILE
    # =============================================================================
    if g['WRITE_HTML_TO_FILE'] == 'Y':
        file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g['SITE_CDE'] + '_' + 'SITE_LISTING' + '.html'
        with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name, 'w+', encoding='utf-8') as f:
            f.writelines(str(soup))

    # =============================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # =============================================================================

    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime("%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0}
            SET finished_at = '{1}'
            WHERE cntry_cde = '{2}'
              and msmt_dte_id = {3}""".format(
        g['TBL_NME'],      #[0]
        finished_at,       #[1]
        g['CNTRY_CDE'],    #[2]
        g['MSMT_DTE_ID']   #[3]
    )
    dbmgr.query(q)
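
# =============================================================================
# STANDALONE CHECK OF THE LABEL-PAIR EXTRACTION USED ABOVE: THE FIRST <label>
# IN A refineitem CARRIES THE DESC, THE SECOND CARRIES THE COUNT. SAMPLE
# STRINGS BELOW ARE MADE UP FOR ILLUSTRATION.
# =============================================================================
import re

facet = ['<label for="ft">Full Time</label>', '<label class="c">1,234</label>']
desc = re.search('>(.*)</label', facet[0]).group(1).upper()
m = re.search('>(.*)</label', facet[1])
count = m.group(1) if m.group(1) else '0'
assert (desc, count) == ('FULL TIME', '1,234')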
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)

    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(-int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')

    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0}
            WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2})
              and cntry_cde = '{3}'
              and site_cde = '{4}'""".format(
        g['TBL_NME'],       #[0]
        g['MSMT_DTE_ID'],   #[1]
        retention_date_id,  #[2]
        g['CNTRY_CDE'],     #[3]
        g['SITE_CDE']       #[4]
    )
    dbmgr.query(q)

    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    #print(soup)

    # =============================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE DB statements
    # =============================================================================

    # PASS 1 - INDUSTRY & REGION COUNT ============================================
    for div in soup.find_all('div', id='ajaxRefineSearch'):
        for ref in div.find_all('div', class_='refineItem'):
            refText = str(ref).upper()

            # FACET TYPE
            if 'LOCATION_STATE' in refText:
                facet_type = 'REGION'
            elif 'EMPLOYMENTTYPE' in refText:
                facet_type = 'JOB TYPE'
            elif 'COMPANYNAME' in refText:
                facet_type = 'COMPANY NAME'
            elif 'JOBCATEGORY' in refText:
                facet_type = 'INDUSTRY'
            elif 'LOCATION_COUNTRY' in refText:
                facet_type = 'LOCATION'
            elif 'SALARYTYPE' in refText:
                facet_type = 'SALARY ESTIMATE'

            # FACET DESCRIPTION
            for links in ref.find_all('a'):
                linkText = links.string.upper()
                facet_desc = linkText
                try:  # IGNORES ENTRIES THAT HAVE NO nbr VAL
                    # NUMBER VALUE
                    nbr = re.search(r'\((\d+(?:\.\d+)?)\)', refText).group(1)
                    nbr = str(nbr).replace(',', '')
                    facet_count = nbr
                    #facet_count = re.findall(r'\d+', nbr)

                    # =============================================================================
                    # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                    # =============================================================================
                    dbmgr = pyDB(g['DB'])
                    q = r"""INSERT INTO {0}
                            (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE,
                             FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
                            VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                        g['TBL_NME'],       #[0]
                        g['MSMT_DTE_ID'],   #[1]
                        g['DATA_TYPE'],     #[2]
                        g['CNTRY_CDE'],     #[3]
                        g['SITE_CDE'],      #[4]
                        facet_type,         #[5]
                        facet_desc,         #[6]
                        facet_count,        #[7]
                        g['STARTED_AT'],    #[8]
                        ''                  #[9]
                    )
                    dbmgr.query(q)

                    # =============================================================================
                    # WRITE HTML PAGE TO FILE
                    # =============================================================================
                    if g['WRITE_HTML_TO_FILE'] == 'Y':
                        file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g['SITE_CDE'] + '_' + \
                            facet_type.replace(' ', '_') + '_' + facet_desc.replace(' ', '_') + '.html'
                        with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name, 'w+', encoding='utf-8') as f:
                            f.writelines(str(soup))
                except:
                    pass

    # =============================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # =============================================================================

    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime("%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0}
            SET finished_at = '{1}'
            WHERE cntry_cde = '{2}'
              and msmt_dte_id = {3}""".format(
        g['TBL_NME'],      #[0]
        finished_at,       #[1]
        g['CNTRY_CDE'],    #[2]
        g['MSMT_DTE_ID']   #[3]
    )
    dbmgr.query(q)
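
# =============================================================================
# NOTE ON THE \((\d+(?:\.\d+)?)\) PATTERN ABOVE: A COUNT WITH A THOUSANDS
# SEPARATOR DOES NOT MATCH \d+ ACROSS THE COMMA, SO "SYDNEY (1,234)" IS
# SILENTLY SKIPPED BY THE try/except. ALLOWING COMMAS IN THE GROUP (AND
# STRIPPING THEM AFTERWARDS) CAPTURES BOTH FORMS. SAMPLE STRINGS MADE UP.
# =============================================================================
import re

ok = re.search(r'\((\d+(?:\.\d+)?)\)', 'SYDNEY (342)').group(1)           # '342'
assert re.search(r'\((\d+(?:\.\d+)?)\)', 'SYDNEY (1,234)') is None        # SKIPPED
both = re.search(r'\(([\d,]+)\)', 'SYDNEY (1,234)').group(1).replace(',', '')
assert (ok, both) == ('342', '1234')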
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)

    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(-int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')

    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0}
            WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2})
              and cntry_cde = '{3}'
              and site_cde = '{4}'""".format(
        g['TBL_NME'],       #[0]
        g['MSMT_DTE_ID'],   #[1]
        retention_date_id,  #[2]
        g['CNTRY_CDE'],     #[3]
        g['SITE_CDE']       #[4]
    )
    dbmgr.query(q)

    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    #print(str(soup).encode('ascii', 'ignore'))

    # =============================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE DB statements
    # =============================================================================

    # PASS 0 - TOTAL COUNT ========================================================
    facet_type = 'TOTAL JOBS'
    for div in soup.find_all('div', class_='counter'):
        i = 0
        for span in div.find_all('span'):
            if i == 0:
                facet_desc = 'ALL JOBS'
            elif i == 1:
                facet_desc = 'ALL COMPANIES'
            else:
                facet_desc = 'NOT CATEGORISED'
            spanval = div.find_all('span')[i]
            txt1 = spanval.text.replace(',', '')
            txt2 = re.findall(r'\d+', txt1)
            facet_count = txt2[0]

            # =============================================================================
            # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
            # =============================================================================
            dbmgr = pyDB(g['DB'])
            q = r"""INSERT INTO {0}
                    (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE,
                     FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
                    VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                g['TBL_NME'],       #[0]
                g['MSMT_DTE_ID'],   #[1]
                g['DATA_TYPE'],     #[2]
                g['CNTRY_CDE'],     #[3]
                g['SITE_CDE'],      #[4]
                facet_type,         #[5]
                facet_desc,         #[6]
                facet_count,        #[7]
                g['STARTED_AT'],    #[8]
                ''                  #[9]
            )
            dbmgr.query(q)
            i = i + 1

    # PASS 1 - REGION COUNT =======================================================
    for div in soup.find_all('div', id='locationTabContent'):
        # LOCATION/REGION
        facet_type = 'REGION'
        for li in div.find_all('li'):
            for a in li.find_all('a', class_='region', href=True):
                facet_desc = a.text.upper().replace('JOBS IN', '').strip()
            for span in li.find_all('span'):
                txt1 = span.text.replace(',', '')
                txt2 = re.findall(r'\d+', txt1)
                facet_count = txt2[0]

                # =============================================================================
                # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                # =============================================================================
                dbmgr = pyDB(g['DB'])
                q = r"""INSERT INTO {0}
                        (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE,
                         FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
                        VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                    g['TBL_NME'],       #[0]
                    g['MSMT_DTE_ID'],   #[1]
                    g['DATA_TYPE'],     #[2]
                    g['CNTRY_CDE'],     #[3]
                    g['SITE_CDE'],      #[4]
                    facet_type,         #[5]
                    facet_desc,         #[6]
                    facet_count,        #[7]
                    g['STARTED_AT'],    #[8]
                    ''                  #[9]
                )
                dbmgr.query(q)

    # PASS 2 - INDUSTRY COUNT =====================================================
    for div in soup.find_all('div', id='sectorTabContent'):
        # INDUSTRY/SECTOR
        facet_type = 'INDUSTRY'
        for li in div.find_all('li'):
            for a in li.find_all('a', href=True):
                facet_desc = a.text.upper().strip()
            for span in li.find_all('span'):
                txt1 = span.text.replace(',', '')
                txt2 = re.findall(r'\d+', txt1)
                facet_count = txt2[0]

                # =============================================================================
                # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                # =============================================================================
                dbmgr = pyDB(g['DB'])
                q = r"""INSERT INTO {0}
                        (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE,
                         FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
                        VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                    g['TBL_NME'],       #[0]
                    g['MSMT_DTE_ID'],   #[1]
                    g['DATA_TYPE'],     #[2]
                    g['CNTRY_CDE'],     #[3]
                    g['SITE_CDE'],      #[4]
                    facet_type,         #[5]
                    facet_desc,         #[6]
                    facet_count,        #[7]
                    g['STARTED_AT'],    #[8]
                    ''                  #[9]
                )
                dbmgr.query(q)

    # =============================================================================
    # WRITE HTML PAGE TO FILE
    # =============================================================================
    if g['WRITE_HTML_TO_FILE'] == 'Y':
        file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g['SITE_CDE'] + '_' + \
            facet_type.replace(' ', '_') + '_' + facet_desc.replace(' ', '_') + '.html'
        with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name, 'w+', encoding='utf-8') as f:
            f.writelines(str(soup))

    # =============================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # =============================================================================

    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime("%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0}
            SET finished_at = '{1}'
            WHERE cntry_cde = '{2}'
              and msmt_dte_id = {3}""".format(
        g['TBL_NME'],      #[0]
        finished_at,       #[1]
        g['CNTRY_CDE'],    #[2]
        g['MSMT_DTE_ID']   #[3]
    )
    dbmgr.query(q)
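
# =============================================================================
# MINIMAL STANDALONE CHECK OF THE REGION li PARSING USED ABOVE: THE <a> GIVES
# THE DESC, THE SIBLING <span> GIVES THE COUNT. MARKUP AND VALUES ARE MADE UP.
# =============================================================================
from bs4 import BeautifulSoup
import re

html = ('<div id="locationTabContent"><ul><li>'
        '<a class="region" href="/auckland">Jobs in Auckland</a>'
        '<span>(2,468)</span></li></ul></div>')
soup = BeautifulSoup(html, 'html.parser')
for div in soup.find_all('div', id='locationTabContent'):
    for li in div.find_all('li'):
        desc = li.find('a', class_='region').text.upper().replace('JOBS IN', '').strip()
        count = re.findall(r'\d+', li.find('span').text.replace(',', ''))[0]
        print(desc, count)  # PRINTS: AUCKLAND 2468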
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)

    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(
        -int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')

    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0}
            WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2})
              and cntry_cde = '{3}'
              and site_cde = '{4}'""".format(
        g['TBL_NME'],       #[0]
        g['MSMT_DTE_ID'],   #[1]
        retention_date_id,  #[2]
        g['CNTRY_CDE'],     #[3]
        g['SITE_CDE']       #[4]
    )
    dbmgr.query(q)

    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    #print(soup)

    # ==========================================================================================================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE DB statements
    # ==========================================================================================================================================================

    # PASS 1 - INDUSTRY COUNT =====================================================================
    facet_type = 'INDUSTRY'
    for div in soup.find_all('div', class_="content-holder container"):
        for a in div.find_all('a'):
            for span in a.find_all('span'):
                # relies on each "title" span appearing before its "count" span
                if 'TITLE' in str(span).upper():
                    txt = span.text.upper().replace(r"'", '')
                elif 'COUNT' in str(span).upper():
                    nbr = re.findall(r'\d+', span.text)
                    facet_desc = txt
                    facet_count = int(str(nbr[0]))

                    # =============================================================================
                    # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                    # =============================================================================
                    dbmgr = pyDB(g['DB'])
                    q = r"""INSERT INTO {0}
                            (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE,
                             FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
                            VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                        g['TBL_NME'],      #[0]
                        g['MSMT_DTE_ID'],  #[1]
                        g['DATA_TYPE'],    #[2]
                        g['CNTRY_CDE'],    #[3]
                        g['SITE_CDE'],     #[4]
                        facet_type,        #[5]
                        facet_desc,        #[6]
                        facet_count,       #[7]
                        g['STARTED_AT'],   #[8]
                        ''                 #[9]
                    )
                    dbmgr.query(q)

    # =============================================================================
    # WRITE HTML PAGE TO FILE
    # =============================================================================
    if g['WRITE_HTML_TO_FILE'] == 'Y':
        file_name = (g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' +
                     g['SITE_CDE'] + '_' + 'SITE_LISTING' + '.html')
        with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name,
                  'w+', encoding='utf-8') as f:
            f.writelines(str(soup))  # the with block closes the file; no f.close() needed

    # ==========================================================================================================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # ==========================================================================================================================================================

    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime(
        "%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0}
            SET finished_at = '{1}'
            WHERE cntry_cde = '{2}'
              and msmt_dte_id = {3}""".format(
        g['TBL_NME'],     #[0]
        finished_at,      #[1]
        g['CNTRY_CDE'],   #[2]
        g['MSMT_DTE_ID']  #[3]
    )
    dbmgr.query(q)
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)

    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(
        -int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')

    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0}
            WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2})
              and cntry_cde = '{3}'
              and site_cde = '{4}'""".format(
        g['TBL_NME'],       #[0]
        g['MSMT_DTE_ID'],   #[1]
        retention_date_id,  #[2]
        g['CNTRY_CDE'],     #[3]
        g['SITE_CDE']       #[4]
    )
    dbmgr.query(q)

    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    #print(soup)

    # ==========================================================================================================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE DB statements
    # ==========================================================================================================================================================

    # PASS 1 - TOTAL COUNT ========================================================================
    facet_type = 'TOTAL'
    facet_desc = 'ALL JOBS'
    nbr = re.search('>(.*?)jobs</span>', str(soup)).group(1)
    nbr = str(nbr).replace(',', '')
    nbr = re.findall(r'\d+', nbr)
    facet_count = nbr[0]

    # =============================================================================
    # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""INSERT INTO {0}
            (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE,
             FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
            VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
        g['TBL_NME'],      #[0]
        g['MSMT_DTE_ID'],  #[1]
        g['DATA_TYPE'],    #[2]
        g['CNTRY_CDE'],    #[3]
        g['SITE_CDE'],     #[4]
        facet_type,        #[5]
        facet_desc,        #[6]
        facet_count,       #[7]
        g['STARTED_AT'],   #[8]
        ''                 #[9]
    )
    dbmgr.query(q)

    # =============================================================================
    # WRITE HTML PAGE TO FILE
    # =============================================================================
    if g['WRITE_HTML_TO_FILE'] == 'Y':
        file_name = (g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' +
                     g['SITE_CDE'] + '_' + facet_type.replace(' ', '_') + '_' +
                     facet_desc.replace(' ', '_') + '.html')
        with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name,
                  'w+', encoding='utf-8') as f:
            f.writelines(str(soup))

    # PASS 2 - INDUSTRY COUNT =====================================================================
    for links in soup.find_all('a'):
        link_txt = str(links.get('href'))
        link_nbr = re.findall(r'\d+', link_txt)
        link_nbr_ = ''.join(str(e) for e in link_nbr)
        if link_nbr_:
            nbr_chk = int(link_nbr_)
        else:
            nbr_chk = 0
        if 'JOBS-IN-' in link_txt.upper():
            facet_type = 'REGION'
        else:
            facet_type = 'INDUSTRY'

        # FINAL ASSIGNMENTS
        facet_desc = links.string.upper()
        facet_desc = re.sub(
            r"[!@#$']", '',
            str(facet_desc))  # removes special characters from string

        # if the href matches what is considered relevant, do the following
        # (REGION_CHK_ID is cast to int so the comparison matches nbr_chk,
        # consistent with the other g[] integer lookups)
        if 'JOBS-' in link_txt.upper() and nbr_chk <= int(g['REGION_CHK_ID']):
            time.sleep(
                rndm_sleep
            )  # INSERTS PAUSE TO ASSIST REFLECTING HUMAN INTERACTION ON WEBPAGE

            # =============================================================================
            # PASS URL TO RETURN HTML FROM SITE PAGE
            # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
            # =============================================================================
            url = g['URL'] + link_txt
            passedHTML = pyHTMLPass.htmlPass(url, **g)
            soup = BeautifulSoup(passedHTML, "html.parser")
            #print(soup)
            #soup = soup.encode("utf-8")  # CODE PAGE ERROR - CONVERTS
            #soup = str(soup)
            try:
                nbr = re.search(r'1 to(.*?)jobs',
                                str(soup.encode("utf-8"))).group(1)
                # keep only the trailing total from "1 to N of TOTAL"
                facet_count = int(nbr.strip().replace(
                    nbr.strip().rpartition(' ')[0], ''))
            except Exception:
                facet_count = 0

            # =============================================================================
            # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
            # =============================================================================
            dbmgr = pyDB(g['DB'])
            q = r"""INSERT INTO {0}
                    (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE,
                     FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
                    VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                g['TBL_NME'],      #[0]
                g['MSMT_DTE_ID'],  #[1]
                g['DATA_TYPE'],    #[2]
                g['CNTRY_CDE'],    #[3]
                g['SITE_CDE'],     #[4]
                facet_type,        #[5]
                facet_desc,        #[6]
                facet_count,       #[7]
                g['STARTED_AT'],   #[8]
                ''                 #[9]
            )
            dbmgr.query(q)

            # =============================================================================
            # WRITE HTML PAGE TO FILE
            # =============================================================================
            if g['WRITE_HTML_TO_FILE'] == 'Y':
                file_name = (g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' +
                             g['SITE_CDE'] + '_' +
                             facet_type.replace(' ', '_') + '_' +
                             facet_desc.replace(' ', '_') + '.html')
                with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name,
                          'w+', encoding='utf-8') as f:
                    f.writelines(str(soup))

    # ==========================================================================================================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # ==========================================================================================================================================================

    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime(
        "%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0}
            SET finished_at = '{1}'
            WHERE cntry_cde = '{2}'
              and msmt_dte_id = {3}""".format(
        g['TBL_NME'],     #[0]
        finished_at,      #[1]
        g['CNTRY_CDE'],   #[2]
        g['MSMT_DTE_ID']  #[3]
    )
    dbmgr.query(q)
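# --- Editorial sketch: the strip/rpartition/replace chain above recovers the
# trailing total from banner text like "1 to 10 of 1,234 jobs", falling back to
# 0 via the except branch. A single comma-tolerant capture group does the same
# job more readably; the pattern is an assumption based on that banner shape,
# not taken from the site itself.
import re

def parse_total_jobs(page_text):
    """Return the total from a '1 to N of TOTAL jobs' banner, else 0."""
    m = re.search(r'1 to\s*\d+\s*of\s*([\d,]+)\s*jobs', page_text,
                  re.IGNORECASE)
    return int(m.group(1).replace(',', '')) if m else 0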
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)

    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(
        -int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')

    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0}
            WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2})
              and cntry_cde = '{3}'
              and site_cde = '{4}'""".format(
        g['TBL_NME'],       #[0]
        g['MSMT_DTE_ID'],   #[1]
        retention_date_id,  #[2]
        g['CNTRY_CDE'],     #[3]
        g['SITE_CDE']       #[4]
    )
    dbmgr.query(q)

    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL'] + g['URL_PART1']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    #print(soup)

    # ==========================================================================================================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE DB statements
    # ==========================================================================================================================================================

    # PASS 1 - TOTAL COUNT ========================================================================
    facet_type = 'TOTAL'
    facet_desc = 'ALL JOBS'
    nbr = re.search('<title>(.*?)</title>',
                    str(soup.encode("utf-8"))).group(1)
    nbr = str(nbr).replace(',', '')
    nbr = re.findall(r'\d+', nbr)
    facet_count = nbr[0]

    # =============================================================================
    # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""INSERT INTO {0}
            (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE,
             FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
            VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
        g['TBL_NME'],      #[0]
        g['MSMT_DTE_ID'],  #[1]
        g['DATA_TYPE'],    #[2]
        g['CNTRY_CDE'],    #[3]
        g['SITE_CDE'],     #[4]
        facet_type,        #[5]
        facet_desc,        #[6]
        facet_count,       #[7]
        g['STARTED_AT'],   #[8]
        ''                 #[9]
    )
    dbmgr.query(q)

    # PASS 2 - INDUSTRY COUNT =====================================================================
    for ul in soup.find_all('ul', class_='facet'):
        for li in ul.find_all('li'):
            # return the facet text (section title)
            facet = li.find(
                'strong'
            )  # assumes the first row of the facet is the "title" row - breaks if it isn't
            if facet:
                facet_type = facet.text.upper()
            else:
                facet_type = facet_type.upper(
                )  # if None is found, carry the current facet_type value forward
            facet_desc = li.find('a')
            if facet_desc:  # checks if there is a result on the search for the "a" anchor (removes the title of the sections by default - returned above)
                try:
                    facet_desc = facet_desc.text.upper()
                    facet_desc = re.sub(
                        r"[!@#$']", '',
                        str(facet_desc))  # removes special characters from string
                    facet_count = li.find('span')
                    facet_count = int(facet_count.text.replace(',', ''))

                    # =============================================================================
                    # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                    # =============================================================================
                    dbmgr = pyDB(g['DB'])
                    q = r"""INSERT INTO {0}
                            (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE,
                             FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
                            VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                        g['TBL_NME'],      #[0]
                        g['MSMT_DTE_ID'],  #[1]
                        g['DATA_TYPE'],    #[2]
                        g['CNTRY_CDE'],    #[3]
                        g['SITE_CDE'],     #[4]
                        facet_type,        #[5]
                        facet_desc,        #[6]
                        facet_count,       #[7]
                        g['STARTED_AT'],   #[8]
                        ''                 #[9]
                    )
                    dbmgr.query(q)
                except Exception:
                    pass
            else:  # if no "a" anchor is found, ignore
                pass

    # =============================================================================
    # WRITE HTML PAGE TO FILE
    # =============================================================================
    if g['WRITE_HTML_TO_FILE'] == 'Y':
        file_name = (g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' +
                     g['SITE_CDE'] + '_' + 'SITE_LISTING' + '.html')
        with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name,
                  'w+', encoding='utf-8') as f:
            f.writelines(str(soup))

    # PASS 3 - REGION COUNT =====================================================================
    facet_type = 'REGION'
    regions = g['REGIONS']
    regions_array = []

    # CONVERTS LIST OBJECT TO ARRAY FOR LOOPING
    for item in regions.split(','):  # COMMA, OR OTHER
        regions_array.append(item)

    # LOOP THROUGH ALL THE ITEMS IN REGIONS
    for region in regions_array:
        time.sleep(
            rndm_sleep
        )  # INSERTS PAUSE TO ASSIST REFLECTING HUMAN INTERACTION ON WEBPAGE

        # =============================================================================
        # PASS URL TO RETURN HTML FROM SITE PAGE
        # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
        # =============================================================================
        # rebuild from the base URL each pass; appending to the previous
        # iteration's url would compound URL_PART2 + region on every loop
        url = (g['URL'] + g['URL_PART1'] + g['URL_PART2'] +
               '{}'.format(region.replace(' ', '+')))
        passedHTML = pyHTMLPass.htmlPass(url, **g)
        soup = BeautifulSoup(passedHTML, "html.parser")
        #print(soup)
        facet_desc = str(region.upper())
        facet_count = re.search(r'10</span> of <span>(.*?)</span>',
                                str(soup.encode("utf-8"))).group(1)
        facet_count = int(facet_count.replace(',', ''))

        # =============================================================================
        # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
        # =============================================================================
        dbmgr = pyDB(g['DB'])
        q = r"""INSERT INTO {0}
                (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE,
                 FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
                VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
            g['TBL_NME'],      #[0]
            g['MSMT_DTE_ID'],  #[1]
            g['DATA_TYPE'],    #[2]
            g['CNTRY_CDE'],    #[3]
            g['SITE_CDE'],     #[4]
            facet_type,        #[5]
            facet_desc,        #[6]
            facet_count,       #[7]
            g['STARTED_AT'],   #[8]
            ''                 #[9]
        )
        dbmgr.query(q)

        # =============================================================================
        # WRITE HTML PAGE TO FILE
        # =============================================================================
        if g['WRITE_HTML_TO_FILE'] == 'Y':
            file_name = (g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' +
                         g['SITE_CDE'] + '_' +
                         facet_type.replace(' ', '_').replace('/', '-') + '_' +
                         facet_desc.replace(' ', '_').replace('/', '-') +
                         '.html')
            with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name,
                      'w+', encoding='utf-8') as f:
                f.writelines(str(soup))

    # ==========================================================================================================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # ==========================================================================================================================================================

    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime(
        "%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0}
            SET finished_at = '{1}'
            WHERE cntry_cde = '{2}'
              and msmt_dte_id = {3}""".format(
        g['TBL_NME'],     #[0]
        finished_at,      #[1]
        g['CNTRY_CDE'],   #[2]
        g['MSMT_DTE_ID']  #[3]
    )
    dbmgr.query(q)
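# --- Editorial sketch, hypothetical helper: the HTML dumps above sanitise facet
# names with chained .replace(' ', '_').replace('/', '-') calls, which misses
# other characters Windows rejects in file names (e.g. ':' or '?'). One regex
# covers them all; the helper name and character class are assumptions.
import re

def slug(text):
    """Collapse anything outside [A-Za-z0-9_-] to '_' for safe file names."""
    return re.sub(r'[^A-Za-z0-9_-]+', '_', text.strip())

# e.g. file_name = '_'.join([g['MSMT_DTE_ID'], g['CNTRY_CDE'], g['SITE_CDE'],
#                            slug(facet_type), slug(facet_desc)]) + '.html'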
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)

    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(
        -int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')

    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0}
            WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2})
              and cntry_cde = '{3}'
              and site_cde = '{4}'""".format(
        g['TBL_NME'],       #[0]
        g['MSMT_DTE_ID'],   #[1]
        retention_date_id,  #[2]
        g['CNTRY_CDE'],     #[3]
        g['SITE_CDE']       #[4]
    )
    dbmgr.query(q)

    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL'] + g['URL_PART1']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    #print(soup)

    # ==========================================================================================================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE DB statements
    # ==========================================================================================================================================================

    # PASS 1 - INDUSTRY & REGION ==========================================================
    for div in soup.find_all('div', id="centre_col"):
        for a in div.find_all('a'):
            if r"/IN/" in str(a).upper():
                facet_type = 'REGION'
            else:
                facet_type = 'INDUSTRY'
            dest_url1 = str(a.get('href'))
            time.sleep(
                rndm_sleep
            )  # INSERTS PAUSE TO ASSIST REFLECTING HUMAN INTERACTION ON WEBPAGE

            # =============================================================================
            # PASS URL TO RETURN HTML FROM SITE PAGE
            # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
            # =============================================================================
            url = g['URL'] + dest_url1
            passedHTML = pyHTMLPass.htmlPass(url, **g)
            soup = BeautifulSoup(passedHTML, "html.parser")
            #soup = soup.encode("utf-8")  # CODE PAGE ERROR - CONVERTS
            #soup = str(soup)             # may need to repass text back through the BeautifulSoup interpreter

            for h1 in soup.find_all('h1'):
                for a in h1.find_all('a'):  # note: reuses the name 'a' from the outer loop
                    dest_url2 = str(a.get('href'))
                    txt = a.text.upper()
                    facet_desc = txt.replace(r"'", '').replace('JOBS', '').strip()

                    # =============================================================================
                    # PASS URL TO RETURN HTML FROM SITE PAGE
                    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
                    # =============================================================================
                    url = g['URL'] + dest_url2
                    passedHTML = pyHTMLPass.htmlPass(url, **g)
                    soup = BeautifulSoup(passedHTML, "html.parser")
                    #soup = soup.encode("utf-8")  # CODE PAGE ERROR - CONVERTS
                    #soup = str(soup)             # may need to repass text back through the BeautifulSoup interpreter
                    nbr = re.search(
                        r'</SPAN> OF <SPAN>(.*?)</SPAN>',
                        str(soup).encode("utf-8", "ignore").decode(
                            'ascii', 'ignore').upper()).group(1)
                    nbr = str(nbr).replace(',', '')
                    facet_count = int(nbr)

                    # =============================================================================
                    # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                    # =============================================================================
                    dbmgr = pyDB(g['DB'])
                    q = r"""INSERT INTO {0}
                            (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE,
                             FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
                            VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                        g['TBL_NME'],      #[0]
                        g['MSMT_DTE_ID'],  #[1]
                        g['DATA_TYPE'],    #[2]
                        g['CNTRY_CDE'],    #[3]
                        g['SITE_CDE'],     #[4]
                        facet_type,        #[5]
                        facet_desc,        #[6]
                        facet_count,       #[7]
                        g['STARTED_AT'],   #[8]
                        ''                 #[9]
                    )
                    dbmgr.query(q)

                    # =============================================================================
                    # WRITE HTML PAGE TO FILE
                    # =============================================================================
                    if g['WRITE_HTML_TO_FILE'] == 'Y':
                        file_name = (g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' +
                                     g['SITE_CDE'] + '_' +
                                     facet_type.replace(' ', '_').replace('/', '-') + '_' +
                                     facet_desc.replace(' ', '_').replace('/', '-') +
                                     '.html')
                        with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name,
                                  'w+', encoding='utf-8') as f:
                            f.writelines(str(soup))  # the with block closes the file

    # ==========================================================================================================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # ==========================================================================================================================================================

    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime(
        "%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0}
            SET finished_at = '{1}'
            WHERE cntry_cde = '{2}'
              and msmt_dte_id = {3}""".format(
        g['TBL_NME'],     #[0]
        finished_at,      #[1]
        g['CNTRY_CDE'],   #[2]
        g['MSMT_DTE_ID']  #[3]
    )
    dbmgr.query(q)
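# --- Editorial sketch: the passes above build follow-up URLs by string
# concatenation (g['URL'] + href), which doubles or drops slashes whenever an
# href is absolute or root-relative. urllib.parse.urljoin handles both forms;
# whether this site ever emits absolute hrefs is an assumption.
from urllib.parse import urljoin

def absolute_url(base, href):
    """Join a scraped href onto the site base, tolerating absolute hrefs."""
    return urljoin(base, href)

# e.g. url = absolute_url(g['URL'], dest_url1)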
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)

    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(
        -int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')

    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0}
            WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2})
              and cntry_cde = '{3}'
              and site_cde = '{4}'""".format(
        g['TBL_NME'],       #[0]
        g['MSMT_DTE_ID'],   #[1]
        retention_date_id,  #[2]
        g['CNTRY_CDE'],     #[3]
        g['SITE_CDE']       #[4]
    )
    dbmgr.query(q)

    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    #soup = soup.encode("utf-8","ignore").decode('ascii', 'ignore')
    #print(soup)

    # ==========================================================================================================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE DB statements
    # ==========================================================================================================================================================

    # PASS 1 - INDUSTRY AND REGION COUNT ==========================================================
    for select in soup.find_all('select', id='cat1'):
        for option in select.find_all('option'):
            facet_type = 'INDUSTRY'
            nbr_chk = re.findall(r'\([0-9]*\)',
                                 str(option))  # FIND ANY WITHIN BRACKETS
            if nbr_chk:
                nbr = nbr_chk[0]
                nbr = str(nbr)
                facet_count = int(nbr.replace('(', '').replace(')', ''))
                txt = re.findall(
                    r'">(.+?)</OPTION>',
                    str(option).upper())  # find any within stated regex
                txt = txt[0]
                txt = str(txt)
                txt = re.sub(r'\d', '', txt)
                facet_desc = txt.replace('(', '').replace(')', '').strip()

                # =============================================================================
                # WRITE RESULTS OF SOUP ANALYSIS/SCRAPE TO LOCAL DB
                # =============================================================================
                dbmgr = pyDB(g['DB'])
                q = r"""INSERT INTO {0}
                        (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE,
                         FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT)
                        VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                    g['TBL_NME'],      #[0]
                    g['MSMT_DTE_ID'],  #[1]
                    g['DATA_TYPE'],    #[2]
                    g['CNTRY_CDE'],    #[3]
                    g['SITE_CDE'],     #[4]
                    facet_type,        #[5]
                    facet_desc,        #[6]
                    facet_count,       #[7]
                    g['STARTED_AT'],   #[8]
                    ''                 #[9]
                )
                dbmgr.query(q)
            # options without a bracketed count are ignored

    # =============================================================================
    # WRITE HTML PAGE TO FILE
    # =============================================================================
    if g['WRITE_HTML_TO_FILE'] == 'Y':
        file_name = (g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' +
                     g['SITE_CDE'] + '_' + facet_type.replace(' ', '_') +
                     '_' + 'SITE_LISTING' + '.html')
        with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name,
                  'w+', encoding='utf-8') as f:
            f.writelines(str(soup))  # the with block closes the file; no f.close() needed

    # ==========================================================================================================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # ==========================================================================================================================================================

    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime(
        "%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0}
            SET finished_at = '{1}'
            WHERE cntry_cde = '{2}'
              and msmt_dte_id = {3}""".format(
        g['TBL_NME'],     #[0]
        finished_at,      #[1]
        g['CNTRY_CDE'],   #[2]
        g['MSMT_DTE_ID']  #[3]
    )
    dbmgr.query(q)
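# --- Editorial sketch: the option handling above needs three regex/replace
# passes to split text like "Accounting (123)" into a label and a count. One
# pattern with two capture groups does both at once; treating a missing count
# as "skip this option" mirrors the behaviour above. Names are illustrative.
import re

def parse_option(option_text):
    """Split 'Label (123)' into ('LABEL', 123); return None when no count."""
    m = re.search(r'^(.*?)\s*\((\d+)\)\s*$', option_text.strip())
    if not m:
        return None
    return m.group(1).upper(), int(m.group(2))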