import csv
import datetime
import os

from bs4 import BeautifulSoup

# get_all_skills, get_link and modify are project helpers assumed to be imported
# from elsewhere in the repo.


def scraper(job_links, driver):
    # Get all LD skills to compare and extract skills from the job description
    ld_skills = get_all_skills(driver)

    today = datetime.datetime.now()
    filename = ('indeed_jobs_' + str(today.day) + '_' + today.strftime('%b') + '_' +
                str(today.year) + '_' + str(today.hour) + '_' + str(today.minute) + '.csv')

    # with open(os.path.dirname(os.path.abspath('indeed_jobs')) + '/indeed_jobs/' + filename,
    #           'w', encoding='UTF-16', newline='') as csvfile:  # anisha version
    with open(os.path.dirname(os.path.abspath('indeed_jobs')) + '/indeed_jobs/' + filename,
              'w', newline='') as csvfile:  # gobi version
        writer = csv.writer(csvfile)
        writer.writerow([
            'posted_date', 'skills', 'job_name', 'job_type', 'company_name',
            'description', 'ld_link'
        ])

        for link in job_links:
            driver.get(link)
            detail = driver.find_element_by_class_name('jobsearch-JobComponent')
            result = detail.get_attribute('innerHTML')
            soup = BeautifulSoup(result, 'html.parser')

            # Extracting the required data
            job_name = soup.find("h3").text
            company_name = soup.find("div", class_='icl-u-lg-mr--sm').text
            job_type = soup.find("span", class_='jobsearch-JobMetadataHeader-item').text
            description = soup.find("div", class_='jobsearch-jobDescriptionText').text
            date = soup.find("div", class_='jobsearch-JobMetadataFooter').text.split()

            # The footer reads like "Company - N days ago - ...", so keep the tokens
            # between the first two dashes to get the "N days ago" string
            index = date.index('-') + 1
            parsed_date = date[index:]
            final_date = parsed_date[0:parsed_date.index('-')]
            posted_date = ' '.join(final_date)

            # Change the "N days ago" format into a date; listings older than 30 days
            # keep the raw "30+ days ago" string
            if '30+' not in posted_date:
                if posted_date in ('Today', 'Just posted'):
                    posted_date = datetime.date.today()
                else:
                    if posted_date == '1 day ago':
                        dt = datetime.timedelta(days=1)
                    else:
                        parsed_date = [posted_date.split()[:2]]
                        if parsed_date[0][1] == 'hour':
                            parsed_date[0][1] = 'hours'
                        time_dict = dict((index, float(value))
                                         for value, index in parsed_date)
                        dt = datetime.timedelta(**time_dict)
                    posted_date = datetime.datetime.now() - dt
                    posted_date = posted_date.date()

            # Extract skills from the description and job title
            skills = []
            all_text = description + job_name
            desc = modify(all_text)
            for skill in ld_skills:
                modified = modify(skill).lower()
                if modified in desc.lower():
                    skills.append(skill)

            ld_link = get_link(skills, driver)
            writer.writerow([
                posted_date, skills, job_name, job_type, company_name,
                description, ld_link
            ])
            print('New job record added: ', job_name)

    print('\nSuccessfully created a new csv file for indeed.com jobs - ' + filename + '.')
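
# Illustration only, not part of the original module: the date handling in scraper()
# above converts strings such as "3 days ago" into a date by turning the first two
# tokens into keyword arguments for datetime.timedelta. A standalone, hypothetical
# helper showing the same trick (the name relative_to_date is not from the repo):
def relative_to_date(posted):
    """Turn a relative string like '3 days ago' or '1 hour ago' into a date."""
    value, unit = posted.split()[:2]
    unit = 'hours' if unit == 'hour' else unit  # timedelta needs the plural keyword
    return (datetime.datetime.now() - datetime.timedelta(**{unit: float(value)})).date()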
import csv
import datetime
import os

from bs4 import BeautifulSoup
from dateutil.parser import parse

# get_all_skills, get_link and modify are project helpers assumed to be imported
# from elsewhere in the repo.


def scraper(job_links, driver):
    # Get all LD skills to compare and extract skills from the job description
    ld_skills = get_all_skills(driver)

    today = datetime.datetime.now()
    filename = ('weworkremotely_jobs_' + str(today.day) + '_' + today.strftime('%b') + '_' +
                str(today.year) + '_' + str(today.hour) + '_' + str(today.minute) + '.csv')

    # with open(os.path.dirname(os.path.abspath('weworkremotely_jobs')) + '/weworkremotely_jobs/' + filename,
    #           'w', encoding='UTF-16', newline='') as csvfile:  # anisha version
    with open(os.path.dirname(os.path.abspath('weworkremotely_jobs')) + '/weworkremotely_jobs/' + filename,
              'w', newline='') as csvfile:  # gobi version
        writer = csv.writer(csvfile)
        writer.writerow([
            'posted_date', 'skills', 'job_name', 'job_type', 'company_name',
            'company_location', 'website', 'description', 'ld_link'
        ])

        for link in job_links:
            driver.get(link)
            detail = driver.find_element_by_class_name('content')
            result = detail.get_attribute('innerHTML')
            soup = BeautifulSoup(result, 'html.parser')

            # Initialization
            job_type = []
            skills = []
            company_location = website = 'Not Mentioned'

            # Extracting the required data
            job_name = soup.find("h1").text.strip()
            header = soup.find("div", class_='listing-header-container').text
            posted_date = soup.find("div", class_='listing-header-container').find(
                "time").attrs.get('datetime')
            posted_date = parse(posted_date).date()
            basics = soup.find("div", class_='listing-header-container').find_all("a")
            description = soup.find("div", class_='listing-container').text
            company_card = soup.find("div", class_='company-card')

            # Extracting the company's info
            company_name = company_card.find("h2").text
            company_info = company_card.find_all("h3")
            for data in company_info:
                if data.find("a"):
                    website = data.find("a").attrs.get('href')
                else:
                    company_location = data.text.strip()

            # Arrange the job type details as comma-separated values
            for tag in basics:
                job_type.append(tag.text)
            job_type = ', '.join(job_type)

            # Extract skills from the description and listing header
            all_text = description + header
            desc = modify(all_text)
            for skill in ld_skills:
                modified = modify(skill).lower()
                if modified in desc.lower():
                    skills.append(skill)

            ld_link = get_link(skills, driver)
            writer.writerow([
                posted_date, skills, job_name, job_type, company_name,
                company_location, website, description, ld_link
            ])
            print('New job record added: ', job_name)

    print('\nSuccessfully created a new csv file for weworkremotely.com jobs - ' + filename + '.')
import csv
import datetime
import os

from bs4 import BeautifulSoup

# get_link is a project helper assumed to be imported from elsewhere in the repo.


def scraper(filename, choice, job_links, driver):
    # choice == 1 writes a new csv, anything else appends to an existing one
    if choice == 1:
        mode = 'w'
    else:
        mode = 'a'

    # For writing or updating/appending the csv
    # with open(os.path.dirname(os.path.abspath('stackoverflow_jobs')) + '/stackoverflow_jobs/' + filename,
    #           mode, encoding='UTF-32', newline='') as csvfile:  # anisha version
    with open(os.path.dirname(os.path.abspath('stackoverflow_jobs')) + '/stackoverflow_jobs/' + filename,
              mode, newline='') as csvfile:  # gobi version
        writer = csv.writer(csvfile)
        if mode == 'w':
            writer.writerow([
                'posted_date', 'technologies', 'job_name', 'company', 'job_type',
                'experience_level', 'role', 'industry', 'company_size',
                'company_type', 'ld_link', 'description'
            ])

        num_records_got = 0  # tmp for debug

        # Scraping
        for link in job_links:
            driver.get('https://stackoverflow.com' + link)
            detail = driver.find_element_by_id('mainbar')
            result = detail.get_attribute('innerHTML')
            soup = BeautifulSoup(result, 'html.parser')
            content = soup.find(class_='nav-content')

            # Initialization
            skills = []
            technologies = []
            about = []
            job_type = experience = role = industry = company_size = company_type = 'Not mentioned'

            # Extracting the required data
            tags = content.find_all("a", class_='post-tag')
            for tag in tags:
                technologies.append(tag.text)

            job_name = soup.find("a", class_='fc-black-900').text
            basics = content.find(class_='job-details--about').find_all(class_='mb8')
            company = soup.find("div", class_='fc-black-700').a.text
            description = content.find(class_='mb32 fs-body2 fc-medium pr48').find("div").text
            actual_date = content.find(
                class_='grid fs-body1 fc-black-500 gs8 ai-baseline mb24').text
            actual_date = actual_date.strip()

            # Change the "Posted n days ago" format into a date
            check_date = actual_date.split()
            if '<' in check_date:
                actual_date = actual_date.replace('Posted < ', '')
            else:
                actual_date = actual_date.replace('Posted ', '')
            if actual_date == 'yesterday':
                dt = datetime.timedelta(days=1)
            else:
                parsed_date = [actual_date.split()[:2]]
                if parsed_date[0][1] == 'hour':
                    parsed_date[0][1] = 'hours'
                time_dict = dict((index, float(value)) for value, index in parsed_date)
                dt = datetime.timedelta(**time_dict)
            posted_date = datetime.datetime.now() - dt
            posted_date = posted_date.date()

            # Assign values to the respective variables from the basics list
            for basic in basics:
                data = basic.text
                data = data.replace('\n', '')
                data = data.split(': ')
                about.append([data[0], data[-1]])
            for info in about:
                title = info[0]
                desc = info[-1]
                if title == 'Job type':
                    job_type = desc
                elif title == 'Experience level':
                    experience = desc
                elif title == 'Role':
                    role = desc
                elif title == 'Industry':
                    industry = desc
                elif title == 'Company size':
                    company_size = desc
                else:
                    company_type = desc

            # Turn hyphenated tags into space-separated skill names
            for skill in technologies:
                if '-' in skill:
                    skill = skill.replace('-', ' ')
                skills.append(skill)

            ld_link = get_link(skills, driver)
            writer.writerow([
                posted_date, technologies, job_name, company, job_type, experience,
                role, industry, company_size, company_type, ld_link, description
            ])
            print('New job record added: ', job_name)

            num_records_got += 1
            if num_records_got > 4:
                break  # tmp for debugging

    # check_duplicate(filename)  # no longer needed since we only create a new csv each time now
    # (commented out because it raised an error - not a priority to solve right now since we
    # will only be using option 1 to create a new csv)

    if mode == 'w':
        print('\nSuccessfully created new csv file for stackoverflow.com jobs - ' + filename + '.')
    else:
        print('\nSuccessfully updated', filename + '.')
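
# Hypothetical usage sketch (the filename, link and driver setup below are placeholders,
# not taken from the repo): choice == 1 creates a fresh csv, any other value appends to
# an existing one.
#
#     from selenium import webdriver
#     driver = webdriver.Chrome()
#     links = ['/jobs/123456/example-backend-developer']  # relative links, collected elsewhere
#     scraper('stackoverflow_jobs.csv', 1, links, driver)
#     driver.quit()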
import csv
import datetime
import os
import re

from bs4 import BeautifulSoup

# get_all_skills, get_link and modify are project helpers assumed to be imported
# from elsewhere in the repo.


def scraper(jobs, driver):
    # Get all LD skills to compare and extract skills from the job description
    ld_skills = get_all_skills(driver)

    today = datetime.datetime.now()
    filename = ('remoteok_jobs_' + str(today.day) + '_' + today.strftime('%b') + '_' +
                str(today.year) + '_' + str(today.hour) + '_' + str(today.minute) + '.csv')

    # with open(os.path.dirname(os.path.abspath('remoteok_jobs')) + '/remoteok_jobs/' + filename,
    #           'w', encoding='UTF-16', newline='') as file:  # anisha version
    with open(os.path.dirname(os.path.abspath('remoteok_jobs')) + '/remoteok_jobs/' + filename,
              'w', newline='') as csvfile:  # gobi version
        writer = csv.writer(csvfile)
        writer.writerow([
            'posted_date', 'skills', 'job_name', 'company_name', 'description', 'ld_link'
        ])

        for link in jobs:
            driver.get('https://remoteok.io' + link)

            # Find the table row that holds the job posting (its id starts with "job-");
            # get_attribute can return None for rows without an id
            all_rows = driver.find_elements_by_tag_name("tr")
            for row in all_rows:
                if "job-" in (row.get_attribute('id') or ''):
                    container = row
                    break
            job = BeautifulSoup(container.get_attribute("innerHTML"), 'html.parser')

            # Extracting the required data
            job_detail = job.find(class_="company position company_and_position")
            company_name = job_detail.find("h3").text
            job_name = job_detail.find("h2").text
            tags = job.find(class_="tags").find_all(class_=re.compile("^tag"))
            description = driver.find_element_by_class_name('description').text

            # The datetime attribute ends with an offset like "+00:00"; drop the last
            # colon so strptime's %z directive can parse it
            posted_date = job.find("time")['datetime']
            posted_date = datetime.datetime.strptime(
                ''.join(posted_date.rsplit(':', 1)), '%Y-%m-%dT%H:%M:%S%z')

            # Extract skills from the tags
            skills = []
            for tag in tags:
                tagname = str(tag.find('h3').text).title()
                skills.append(tagname)

            # Extract skills from the description and append them to the skills list
            desc = modify(description)
            for skill in ld_skills:
                modified = modify(skill).lower()
                if modified in desc.lower():
                    skills.append(skill)

            ld_link = get_link(skills, driver)
            writer.writerow([
                posted_date, skills, job_name, company_name, description, ld_link
            ])
            print('New job record added: ', job_name)

    print('\nSuccessfully created a new csv file for remoteok.io jobs - ' + filename + '.')
import csv
import datetime
import os

from bs4 import BeautifulSoup

# get_link is a project helper assumed to be imported from elsewhere in the repo.


def scraper(job_links, driver):
    today = datetime.datetime.now()
    filename = ('angel_co_jobs_' + str(today.day) + '_' + today.strftime('%b') + '_' +
                str(today.year) + '_' + str(today.hour) + '_' + str(today.minute) + '.csv')

    # with open(os.path.dirname(os.path.abspath('angel_co_jobs')) + '/angel_co_jobs/' + filename,
    #           'w', encoding='UTF-16', newline='') as csvfile:  # anisha version
    with open(os.path.dirname(os.path.abspath('angel_co_jobs')) + '/angel_co_jobs/' + filename,
              'w', newline='') as csvfile:  # gobi version
        writer = csv.writer(csvfile)
        writer.writerow([
            'skills', 'job_name', 'job_type', 'experience', 'company_name', 'industry',
            'company_size', 'location', 'hiring_contact', 'website', 'description', 'ld_link'
        ])

        # tmp_limit = 0
        for link in job_links:
            try:
                driver.get(link)
                detail = driver.find_element_by_class_name('wrapper_06a53')
                result = detail.get_attribute('innerHTML')
                soup = BeautifulSoup(result, 'html.parser')
                content = soup.find(class_='content_50e69')

                # Initialization
                about_job = []
                skills = []
                industry = []
                job_type = experience = company_size = location = hiring_contact = website = 'Not mentioned'

                # Extracting the required data
                job_name = content.find("h2", class_='header_ec0af').text
                basics = content.find("div", class_='component_4105f').find_all(
                    "div", class_='characteristic_650ae')
                company_name = content.find("div", class_='name_af83c').find("h1").text
                about_company = soup.find("div", class_='component_3298f').find_all("dt")
                description = content.find("div", class_='description_c90c4').text

                # Extract company size
                try:
                    for about in about_company:
                        extracts = about.text
                        if " people" in extracts:
                            company_size = extracts
                except Exception:
                    pass

                # Extract website link
                try:
                    website = soup.find("li", class_='websiteLink_b71b4').find("a").get('href')
                except Exception:
                    pass

                # Extract hiring contact
                try:
                    hiring = content.find("div", class_='recruitingContact_82245').find(
                        "h4", class_='name_9d036').text
                    hiring_post = content.find("div", class_='recruitingContact_82245').find("span").text
                    hiring_contact = hiring + ', ' + hiring_post
                except Exception:
                    pass

                # Extract industry details
                try:
                    industry_info = soup.find("div", class_='component_3298f').find(
                        "dt", class_='tags_70e20').find_all("a")
                    for info in industry_info:
                        industry.append(info.text)
                except Exception:
                    pass

                # Extract skills; everything else goes into the about_job list
                for basic in basics:
                    title = basic.find("dt").text
                    if title == 'Skills':
                        all_skill = basic.find_all("a")
                        for skill in all_skill:
                            skills.append(skill.text)
                    else:
                        desc = basic.find("dd").text
                        about_job.append([title, desc])

                # Extract basic details about the job
                for info in about_job:
                    title = info[0]
                    desc = info[-1]
                    if title == 'Location':
                        location = desc
                    elif title == 'Job type':
                        job_type = desc
                    elif title == 'Experience':
                        experience = desc
                    else:
                        continue

                ld_link = get_link(skills, driver)
                writer.writerow([
                    skills, job_name, job_type, experience, company_name, industry,
                    company_size, location, hiring_contact, website, description, ld_link
                ])
                print('New job record added: ', job_name)
            except Exception:
                print('Skipping one link since it did not work.')

            # tmp_limit += 1
            # if tmp_limit > 10:
            #     break

    print('\nSuccessfully created a new csv file for angel.co jobs - ' + filename + '.')
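
# Hypothetical usage sketch (the link and driver setup below are placeholders, not taken
# from the repo): this scraper expects full job-listing URLs and an already constructed
# Selenium driver.
#
#     from selenium import webdriver
#     driver = webdriver.Chrome()
#     job_links = ['https://angel.co/company/example-startup/jobs/000000-software-engineer']
#     scraper(job_links, driver)
#     driver.quit()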