def save_to_csv():
    """Save test data to a csv file"""
    today = datetime.now().strftime('%Y%m%d%H%M%S')
    headers = ['FirstName', 'LastName']
    data = [('Israel', 'Dryer'), ('Judy', 'Dryer'), ('Abigail', 'Dryer')]
    filenamepath = r'c:\temp\save_to_csv_test_' + today + '.csv'
    DataTools.save_to_csv(data, filenamepath, headers)

    # test the resulting file
    try:
        with open(filenamepath, newline='', encoding='utf-8') as f:
            reader = csv.reader(f)
            data = list(reader)
    except FileNotFoundError:
        print("FAILED >> Save to CSV File >> No file found. The file did not save")
        return
    try:
        assert data[0][0] == 'FirstName'
        assert data[1][1] == 'Dryer'
    except AssertionError:
        print("FAILED >> Save to CSV File >> The test data does not match the data on file")
        return
    print("SUCCESS >> Save to CSV File")
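# The test above exercises DataTools.save_to_csv(data, filenamepath, headers).
# A minimal sketch of what such a helper could look like, assuming it simply
# writes a header row followed by the data rows (hypothetical -- the real
# implementation lives in the project's DataTools module):
import csv

def save_to_csv_sketch(data, filenamepath, headers):
    """Write an iterable of row tuples to a CSV file with a header row."""
    with open(filenamepath, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(headers)   # e.g. ['FirstName', 'LastName']
        writer.writerows(data)     # e.g. [('Israel', 'Dryer'), ...]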
def run(self):
    """Run the scraper"""
    self.extract_page_urls(None)
    for url in self.urls_to_scrape:
        self.extract_page_data(url)

    if self.data_scraped:
        DataTools.save_to_database(self.data_scraped, CONN_STRING, INSERT_QUERY)

    print(f"{self.name} >> {len(self.data_scraped)} records")
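# Every run() method ends with DataTools.save_to_database(records, CONN_STRING,
# INSERT_QUERY). A minimal sketch of that helper, assuming a pyodbc-compatible
# connection string and a parameterized INSERT statement (hypothetical -- the
# real helper is defined in the project's DataTools module):
import pyodbc

def save_to_database_sketch(records, conn_string, insert_query):
    """Bulk-insert a list of row tuples using a parameterized query."""
    conn = pyodbc.connect(conn_string)
    cursor = conn.cursor()
    cursor.fast_executemany = True  # speeds up large batches on SQL Server drivers
    cursor.executemany(insert_query, records)
    conn.commit()
    conn.close()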
def run(self):
    """Run the scraper"""
    page_num = 1
    while True:
        page = self.get_request(URL.format(page_num), out_format='json', headers=HEADERS)
        if not page['Data']:
            break
        self.extract_page_data(page['Data'])
        page_num += 1

    if self.data_scraped:
        DataTools.save_to_database(self.data_scraped, CONN_STRING, INSERT_QUERY)

    print(f"{self.name} >> {len(self.data_scraped)} records")
def run(self):
    """Run the scraper"""
    url = 'https://recruiting.ultipro.com/CHE1006CBH/JobBoard/8effb9c6-91dc-4fae-4091-71d162d6fafe/JobBoardView/LoadSearchResults'
    json_data = self.post_request(url, headers=HEADERS, data=PAYLOAD, out_format='json')
    cards = json_data['opportunities']
    for card in cards:
        self.extract_card_data(card)

    if self.data_scraped:
        DataTools.save_to_database(self.data_scraped, CONN_STRING, INSERT_QUERY)

    print(f"{self.name} >> {len(self.data_scraped)} records")
def run(self):
    """Run the scraper"""
    self.extract_page_urls(None)
    for url in self.urls_to_scrape:
        soup = self.get_request(url, out_format='soup')
        page = parse_json_to_dict(soup)
        if page:
            record = self.extract_page_data(page)
            self.data_scraped.append(record + (url, ))

    if self.data_scraped:
        DataTools.save_to_database(self.data_scraped, CONN_STRING, INSERT_QUERY)

    print(f"{self.name} >> {len(self.data_scraped)} records")
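# parse_json_to_dict(soup) is assumed to pull structured job data out of the
# detail page. A minimal sketch, assuming the page embeds its data in a
# <script type="application/ld+json"> block (hypothetical -- the real parser
# may target a different script tag or inline variable):
import json

def parse_json_to_dict_sketch(soup):
    """Return the embedded JSON-LD job data as a dict, or None if not found."""
    tag = soup.find('script', type='application/ld+json')
    if not tag or not tag.string:
        return None
    try:
        return json.loads(tag.string)
    except json.JSONDecodeError:
        return None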
def run(self):
    """Run the scraper"""
    url = 'https://careers.bakertilly.com/jobs/'
    self.create_webdriver(headless=False)  # doesn't seem to work with headless
    self.driver.get(url)
    self.extract_page_data(None)
    self.driver.quit()

    if self.data_scraped:
        DataTools.save_to_database(self.data_scraped, CONN_STRING, INSERT_QUERY)

    print(f"{self.name} >> {len(self.data_scraped)} records")
def run(self):
    """Run the scraper"""
    self.extract_page_urls(None)
    if not self.urls_to_scrape:
        print(f"{self.name} >> {len(self.data_scraped)} records")
        return

    for url in list(self.urls_to_scrape):
        try:
            print(url)
            card = self.get_request(url)
            self.extract_card_data(card)
        except Exception:  # skip any page that fails to download or parse
            continue

    if self.data_scraped:
        DataTools.save_to_database(self.data_scraped, CONN_STRING, INSERT_QUERY)

    print(f"{self.name} >> {len(self.data_scraped)} records")
def run(self):
    """Run the scraper"""
    url = "https://recruiting.adp.com/srccar/public/rest/1/1215551/search/"
    page_num = 1
    self.create_session_with_cookies()

    # find total pages to scrape
    _, page_total = self.get_page_pagetotal(page_num, url)

    # scrape all pages
    while page_num < page_total:
        page, _ = self.get_page_pagetotal(page_num, url)
        self.extract_page_data(page)
        page_num += 1

    if self.data_scraped:
        DataTools.save_to_database(self.data_scraped, CONN_STRING, INSERT_QUERY)

    print(f"{self.name} >> {len(self.data_scraped)} records")
def run(self):
    """Run the scraper"""
    url = 'https://careers.cbiz.com/en-US/search?pagenumber={}'
    page_num = 1
    while True:
        page = self.get_request(url.format(page_num), verify=False)
        result = self.extract_page_urls(page)
        page_num += 1
        if not result:
            break

    for url in self.urls_to_scrape:
        page = self.get_request(url, 'response', verify=False)
        self.extract_page_data(page)

    if self.data_scraped:
        DataTools.save_to_database(self.data_scraped, CONN_STRING, INSERT_QUERY)

    print(f"{self.name} >> {len(self.data_scraped)} records")
def run(self):
    """Run the scraper"""
    page_num = 1
    template = (
        "https://cmsservice.smashfly.com/api/jobs/v1/jobs/hZtAUIBJAtYt3u6LLr6IZTmj9c2V39Q6ouvFD2DCuMtzWQbmN-"
        + "GSIsuEta_wfak70?sort=AddedOn-asc&page={}&pageSize=100&group=&filter=&fields=JobTitle%2CShortTextField9"
        + "%2CLocation%2CDisplayJobId%2CUrlJobTitle")
    while True:
        page = self.get_request(template.format(page_num), headers=HEADERS, out_format='json')
        if not page['Data']:
            break
        self.extract_page_data(page)
        page_num += 1

    if self.data_scraped:
        DataTools.save_to_database(self.data_scraped, CONN_STRING, INSERT_QUERY)

    print(f"{self.name} >> {len(self.data_scraped)} records")
def run(self):
    """Run the scraper"""
    url = "https://eisneramper.wd1.myworkdayjobs.com/EisnerAmper_External?clientRequestID=369870641bd94ccd8b96ead80332674d"
    page_num = 0
    while True:
        try:
            # attempt to request json data if exists
            page = self.get_request(url, out_format='json', headers=HEADERS)
        except ws.json.JSONDecodeError:
            break
        self.extract_page_data(page)
        page_num += 50
        url = get_next_page(page, page_num)

    if self.data_scraped:
        DataTools.save_to_database(self.data_scraped, CONN_STRING, INSERT_QUERY)

    print(f"{self.name} >> {len(self.data_scraped)} records")
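# get_next_page(page, page_num) builds the url for the next 50-record slice of
# the Workday job board. A sketch modeled on the equivalent inline logic in the
# CliftonLarsonAllen scraper further below; the exact JSON paths and url
# template are assumptions, not confirmed for this site:
def get_next_page_sketch(page, offset):
    """Build the next Workday search url from the previous JSON response."""
    uri = page['body']['children'][0]['endPoints'][1]['uri']
    client_request_id = page['sessionSecureToken']
    return ("https://eisneramper.wd1.myworkdayjobs.com{}/{}?clientRequestID={}"
            .format(uri, offset, client_request_id))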
def run(self):
    """Run the scraper"""
    url = 'https://us-jobs.kpmg.com/careers/SearchResults/?jobOffset={}'
    page_num = 0
    self.create_session()
    self.session.verify = False
    ws.requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)
    while True:
        page = self.get_request(url.format(page_num), use_session=True, out_format='soup')
        if self.extract_page_data(page):
            page_num += 10
        else:
            break

    if self.data_scraped:
        DataTools.save_to_database(self.data_scraped, CONN_STRING, INSERT_QUERY)

    print(f"{self.name} >> {len(self.data_scraped)} records")
def run(self):
    """Run the scraper"""
    next_page = "https://bkd.avature.net/experiencedcareers/SearchJobs/?jobRecordsPerPage=6&jobOffset={}"
    page_offset = 0
    while True:
        soup = self.get_request(next_page.format(page_offset))
        cards = soup.find_all('article', 'article article--result')
        for card in cards:
            record = self.extract_card_data(card)
            if record:
                self.data_scraped.append(record)
        if len(cards) == 1:
            break
        page_offset += 6

    if self.data_scraped:
        DataTools.save_to_database(self.data_scraped, CONN_STRING, INSERT_QUERY)

    print(f"{self.name} >> {len(self.data_scraped)} records")
def run(self):
    """Run the scraper"""
    url = "https://marcum-hr.secure.force.com/recruit/fRecruit__ApplyJobList"
    self.create_webdriver(implicit_wait=5, headless=True)
    self.driver.get(url)

    # get all job urls
    while True:
        self.extract_page_urls(self.driver.page_source)
        try:
            self.driver.find_element(By.LINK_TEXT, 'Next').click()
        except ws.NoSuchElementException:
            break
    self.driver.close()

    # extract job data
    for page in self.urls_to_scrape:
        self.extract_page_data(page)

    if self.data_scraped:
        DataTools.save_to_database(self.data_scraped, CONN_STRING, INSERT_QUERY)

    print(f"{self.name} >> {len(self.data_scraped)} records")
def run(self):
    """Run the scraper"""
    url = 'https://sjobs.brassring.com/TgNewUI/Search/Ajax/ProcessSortAndShowMoreJobs'
    page_num = 1
    while True:
        PAYLOAD['pageNumber'] = str(page_num)
        page = self.post_request(url, headers=HEADERS, data=PAYLOAD, out_format='json')
        if self.extract_page_data(page):
            page_num += 1
            ws.sleep(0.5)
        else:
            break

    if self.data_scraped:
        DataTools.save_to_database(self.data_scraped, CONN_STRING, INSERT_QUERY)

    print(f"{self.name} >> {len(self.data_scraped)} records")
def run(self):
    """Run the scraper"""
    url = "https://cliftonlarsonallen.wd1.myworkdayjobs.com/CLA"
    template_url = "https://cliftonlarsonallen.wd1.myworkdayjobs.com{}/{}?clientRequestID={}"
    page_num = 0
    while True:
        try:
            page = self.get_request(url, out_format='json', headers=HEADERS)
        except json.JSONDecodeError:
            break
        self.extract_page_data(page)
        page_num += 50
        uri = page['body']['children'][0]['endPoints'][1]['uri']
        client_request_id = page['sessionSecureToken']
        url = template_url.format(uri, page_num, client_request_id)

    if self.data_scraped:
        DataTools.save_to_database(self.data_scraped, CONN_STRING, INSERT_QUERY)

    print(f"{self.name} >> {len(self.data_scraped)} records")
def run(self):
    """Run the scraper"""
    page_num = 1
    template = (
        "https://cmsservice.smashfly.com/api/jobs/v1/jobs/hZtAUIBJAtYt3u6LLr6IZbnp13StNSmiW6NF93TgvUivZ-"
        + "F70L96Q_XDj5YYbxiHlB-6xq-UIZE1?sort=AddedOn-desc&page={}&pageSize=100&group=&filter=&fields=Display"
        + "JobId%2CJobTitle%2CShortTextField4%2CLongTextField7%2CLongTextField5%2CUrlJobTitle"
    )
    while True:
        page = self.get_request(template.format(page_num), headers=HEADERS, out_format='json')
        if not page['Data']:
            break
        self.extract_page_data(page)
        page_num += 1

    if self.data_scraped:
        DataTools.save_to_database(self.data_scraped, CONN_STRING, INSERT_QUERY)

    print(f"{self.name} >> {len(self.data_scraped)} records")
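# How these run() methods would typically be invoked: each scraper is
# instantiated and run in turn, so one site failing does not stop the batch.
# A minimal sketch, assuming the scraper instances are collected elsewhere
# (the driver function name is a placeholder, not part of the project):
def run_all(scrapers):
    """Run each scraper and report any that fail."""
    for scraper in scrapers:
        try:
            scraper.run()
        except Exception as exc:
            print(f"{scraper.name} >> FAILED >> {exc}")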