def run(url):
    """Scrape the PATH catsone job grid and insert each listing.

    Returns the number of rows inserted via job_insert.
    """
    soup = get_soup(url)
    # The first div whose class contains "JobGrid-" holds every job anchor.
    grid = soup.select('div[class*="JobGrid-"]')[0]
    job_class = Job(organization, "")
    job_class.organization_id = organization_id
    inserted = 0
    for anchor in grid.find_all('a'):
        job_class.info_link = 'https://path.catsone.com' + anchor['href']
        cells = anchor.find('div', {'class': 'row'}).find_all('div')
        job_class.title = cells[0].text.strip()
        job_class.location = clean_location(cells[2].text.strip())
        job_class.zip_code = city_to_zip(job_class.location)
        inserted += job_insert(job_class)
    # NOTE: each listing page carries more detail (employment type, salary),
    # but the postings are so inconsistently formatted that a per-listing
    # scraper broke on several of them, so only the grid is scraped.
    return inserted
def run(url):
    """Scrape a 'list-data' style job board and insert each listing.

    Returns the number of rows inserted via job_insert.
    """
    soup = get_soup(url)
    job_class = Job(organization, "")
    job_class.organization_id = organization_id
    inserted = 0
    for entry in soup.find_all('div', {'class': 'list-data'}):
        info = entry.find('div', {'class': 'job-info'})
        job_class.title = info.find('span', {'class': 'job-title'}).text.strip()
        job_class.info_link = info.h4.a['href']
        job_class.full_or_part = entry.find('div', {'class': 'job-type'}).text.strip()
        job_class.location = clean_location(
            entry.find('div', {'class': 'job-location'}).text.strip())
        job_class.zip_code = city_to_zip(job_class.location)
        # Relative date string (e.g. "Posted 3 days ago") -> absolute date.
        posted = entry.find('div', {'class': 'job-date'}).text.strip().split(' ')
        job_class.post_date = date_ago(int(posted[1]), posted[2])
        job_class.summary = entry.find(
            'div', {'class': 'job-description'}).p.text.strip()
        inserted += job_insert(job_class)
    return inserted
def run(url):
    """Scrape the Penny Lane Centers jobs.net result table.

    Makes one extra page fetch per job for type/date details.
    Returns the number of rows inserted via job_insert.
    """
    soup = get_soup(url)
    table = soup.find('table', {'id': 'job-result-table'})
    job_class = Job(organization, "")
    job_class.post_date = ""
    job_class.organization_id = organization_id
    inserted = 0
    for row in table.find_all('tr', {'class': 'job-result'}):
        title_cell = row.find('td', {'class': 'job-result-title-cell'})
        job_class.title = title_cell.a.text.strip()
        job_class.info_link = ('https://pennylanecenters.jobs.net'
                               + title_cell.a['href'])
        job_class.location = clean_location(
            row.find('div', {'class': 'job-location-line'}).text)
        job_class.zip_code = city_to_zip(job_class.location)
        # Employment type and post date only appear on the detail page.
        detail = get_soup(job_class.info_link)
        job_class.full_or_part = detail.find(
            'li', {'class': 'job-employee-type'}).find(
            'div', {'class': 'secondary-text-color'}).text
        job_class.post_date = string_to_date(
            detail.find('li', {'class': 'job-date-posted'}).find(
                'div', {'class': 'secondary-text-color'}).text)
        inserted += job_insert(job_class)
    return inserted
def run(url):
    """Scrape an srJobList table (globals-based scraper) and update the DB.

    Reads each row of the JS-rendered table, stashes the fields on the
    shared `globals` module, then calls update_db(organization) per row.
    """
    soup = get_javascript_soup(url)
    # Skip the header row.
    rows = soup.find('table', {'class': 'srJobList'}).tbody.find_all('tr')[1:]
    for row in rows:
        globals.job_title = row.find(
            'td', {'class': 'srJobListJobTitle'}).text.strip()
        # onclick is presumably a JS wrapper around the listing URL
        # (e.g. window.open('<url>');) — strip the 13 leading and 3
        # trailing characters to leave just the URL.
        on_click = row['onclick']
        globals.info_link = on_click[13:-3]
        globals.full_or_part = row.find(
            'td', {'class': 'srJobListTypeOfEmployment'}).text
        globals.job_location = clean_location(
            row.find('td', {'class': 'srJobListLocation'}).text)
        globals.job_zip_code = city_to_zip(globals.job_location)
        update_db(organization)
def run(url):
    """Scrape the PATH catsone job grid (globals-based scraper).

    Stashes each listing's fields on the shared `globals` module and
    calls update_db(organization) per listing.
    """
    soup = get_soup(url)
    # The first div whose class contains "JobGrid-" holds every job anchor.
    grid = soup.select('div[class*="JobGrid-"]')[0]
    for anchor in grid.find_all('a'):
        globals.info_link = 'https://path.catsone.com' + anchor['href']
        cells = anchor.find('div', {'class': 'row'}).find_all('div')
        globals.job_title = cells[0].text.strip()
        globals.job_location = clean_location(cells[2].text.strip())
        globals.job_zip_code = city_to_zip(globals.job_location)
        update_db(organization)
def run(url):
    """Scrape governmentjobs.com (LAHSA) listings across all result pages.

    Globals-based scraper: fills fields on the shared `globals` module,
    calls update_db per job and reset_vars between jobs, and follows the
    pager until the "next" control is disabled.
    """
    globals.job_post_date = ''
    next_page_url = url
    soup = get_javascript_soup_delayed(next_page_url, 'job-table-title')
    while soup:
        for row in soup.find('tbody').find_all('tr'):
            title_cell = row.find('td', {'class': 'job-table-title'})
            globals.job_title = title_cell.a.text.strip()
            globals.info_link = ('https://www.governmentjobs.com'
                                 + title_cell.a['href'])
            globals.salary = row.find('td', {'class': 'job-table-salary'}).text
            globals.full_or_part = row.find('td', {'class': 'job-table-type'}).text
            # Location and summary only appear on the listing's own page.
            job_soup = get_soup(globals.info_link)
            info_container = job_soup.find('div', {'class': 'summary container'})
            globals.job_location = clean_location(
                info_container.find('div', {'id': 'location-label-id'})
                .parent.find_all('div')[2].text)
            globals.job_zip_code = city_to_zip(globals.job_location)
            globals.job_summary = job_soup.find(
                'div', {'id': 'details-info'}).find('p').text
            update_db(organization)
            reset_vars()
        # Follow pagination; a disabled "next" control marks the last page.
        next_li = soup.find('li', {'class': 'PagedList-skipToNext'})
        if 'disabled' not in next_li.get("class"):
            next_page_url = ('https://www.governmentjobs.com/careers/lahsa?'
                             + next_li.a['href'].split('?')[1])
            soup = get_javascript_soup_delayed(next_page_url, 'job-table-title')
        else:
            soup = False
def run(url):
    """Scrape the Penny Lane jobs.net result table (globals-based scraper).

    Stashes each listing's fields on the shared `globals` module and
    calls update_db / reset_vars per listing.
    """
    globals.job_post_date = ''
    soup = get_soup(url)
    table = soup.find('table', {'id': 'job-result-table'})
    for row in table.find_all('tr', {'class': 'job-result'}):
        title_cell = row.find('td', {'class': 'job-result-title-cell'})
        globals.job_title = title_cell.a.text.strip()
        globals.info_link = ('https://pennylanecenters.jobs.net'
                             + title_cell.a['href'])
        # The summary field is just the link to the posting itself.
        globals.job_summary = globals.info_link
        globals.job_location = clean_location(
            row.find('div', {'class': 'job-location-line'}).text)
        globals.job_zip_code = city_to_zip(globals.job_location)
        # Employment type and post date only appear on the detail page.
        detail = get_soup(globals.info_link)
        globals.full_or_part = detail.find(
            'li', {'class': 'job-employee-type'}).find(
            'div', {'class': 'secondary-text-color'}).text
        globals.job_post_date = string_to_date(
            detail.find('li', {'class': 'job-date-posted'}).find(
                'div', {'class': 'secondary-text-color'}).text)
        update_db(organization)
        reset_vars()
def run(url):
    """Scrape governmentjobs.com (LAHSA) listings across all result pages.

    Job-class variant: fills a Job instance per listing and follows the
    pager until the "next" control is disabled.
    Returns the number of rows inserted via job_insert.
    """
    next_page_url = url
    soup = get_javascript_soup_delayed(next_page_url, 'job-table-title')
    job_class = Job(organization, "")
    job_class.organization_id = organization_id
    insert_count = 0
    while soup:
        for row in soup.find('tbody').find_all('tr'):
            title_cell = row.find('td', {'class': 'job-table-title'})
            job_class.title = title_cell.a.text.strip()
            job_class.info_link = ('https://www.governmentjobs.com'
                                   + title_cell.a['href'])
            job_class.salary = row.find('td', {'class': 'job-table-salary'}).text
            job_class.full_or_part = row.find('td', {'class': 'job-table-type'}).text
            # Location and summary only appear on the listing's own page.
            job_soup = get_soup(job_class.info_link)
            info_container = job_soup.find('div', {'class': 'summary container'})
            job_class.location = clean_location(
                info_container.find('div', {'id': 'location-label-id'})
                .parent.find_all('div')[2].text)
            job_class.zip_code = city_to_zip(job_class.location)
            job_class.summary = job_soup.find(
                'div', {'id': 'details-info'}).find('p').text
            insert_count += job_insert(job_class)
        # Follow pagination; a disabled "next" control marks the last page.
        next_li = soup.find('li', {'class': 'PagedList-skipToNext'})
        if 'disabled' not in next_li.get("class"):
            next_page_url = ('https://www.governmentjobs.com/careers/lahsa?'
                             + next_li.a['href'].split('?')[1])
            soup = get_javascript_soup_delayed(next_page_url, 'job-table-title')
        else:
            soup = False
    return insert_count
def run(url):
    """Scrape a cws-search-results table; one detail fetch per job.

    Returns the number of rows inserted via job_insert.
    """
    soup = get_javascript_soup(url)
    # Skip the header row.
    rows = soup.find('table', {'id': 'cws-search-results'}).find_all('tr')[1:]
    job_class = Job(organization, "")
    job_class.organization_id = organization_id
    inserted = 0
    for row in rows:
        cells = row.find_all('td')
        job_class.title = cells[1].a.text.strip()
        job_class.info_link = cells[1].a['href']
        job_class.location = clean_location(cells[2].text)
        job_class.zip_code = city_to_zip(job_class.location)
        # Employment type sits next to its label on the detail page.
        detail = get_soup(job_class.info_link)
        job_class.full_or_part = detail.find(
            text="Employment Duration:").parent.parent.b.text.strip()
        inserted += job_insert(job_class)
    return inserted
def run(url):
    """Scrape a Paycom job board and insert each listing.

    Returns the number of rows inserted via job_insert.
    """
    soup = get_javascript_soup(url)
    job_class = Job(organization, "")
    job_class.organization_id = organization_id
    inserted = 0
    for listing in soup.find_all('div', {'class': 'jobInfo'}):
        title_span = listing.find('span', {'class': 'jobTitle'})
        job_class.title = title_span.a.text.strip()
        job_class.info_link = ('https://www.paycomonline.net'
                               + title_span.a['href'])
        # Location is rendered as "<prefix> - <city>"; keep the city part.
        # NOTE(review): other scrapers in this file call clean_location /
        # city_to_zip directly — confirm the globals.* aliases exist here.
        location_text = listing.find('span', {'class': 'jobLocation'}).text
        if location_text:
            job_class.location = globals.clean_location(
                location_text.split(' - ')[1])
            job_class.zip_code = globals.city_to_zip(job_class.location)
        description_text = listing.find('span', {'class': 'jobDescription'}).text
        if description_text:
            job_class.summary = description_text.strip()
        type_text = listing.find('span', {'class': 'jobType'}).text
        if type_text:
            lowered = str(type_text).lower()
            if 'ft' in lowered or 'full' in lowered:
                job_class.full_or_part = 'full'
            else:
                job_class.full_or_part = 'part'
        inserted += job_insert(job_class)
    return inserted