Example #1
def scraper(job_links, driver):

    # Get all LD skills to compare and extract skills from the job description
    ld_skills = get_all_skills(driver)

    today = datetime.datetime.now()
    filename = 'indeed_jobs_' + str(
        today.day) + '_' + strftime("%b") + '_' + str(today.year) + '_' + str(
            today.hour) + '_' + str(today.minute) + '.csv'

    #with open(os.path.dirname(os.path.abspath('indeed_jobs')) + '/indeed_jobs/' + filename, 'w', encoding='UTF-16', newline='') as csvfile: # anisha version
    with open(os.path.dirname(os.path.abspath('indeed_jobs')) +
              '/indeed_jobs/' + filename,
              'w',
              newline='') as csvfile:  # gobi version
        writer = csv.writer(csvfile)
        writer.writerow([
            'posted_date', 'skills', 'job_name', 'job_type', 'company_name',
            'description', 'ld_link'
        ])

        for link in job_links:
            driver.get(link)
            detail = driver.find_element_by_class_name(
                'jobsearch-JobComponent')
            result = detail.get_attribute('innerHTML')
            soup = BeautifulSoup(result, 'html.parser')

            # Extracting the required data
            job_name = soup.find("h3").text
            company_name = soup.find("div", class_='icl-u-lg-mr--sm').text
            job_type = soup.find(
                "span", class_='jobsearch-JobMetadataHeader-item').text
            description = soup.find("div",
                                    class_='jobsearch-jobDescriptionText').text
            date = soup.find(
                "div", class_='jobsearch-JobMetadataFooter').text.split()

            # Change date into N days ago format
            index = date.index('-') + 1
            parsed_date = date[index:]
            final_date = parsed_date[0:parsed_date.index('-')]
            posted_date = ' '.join(final_date)

            # Change N days ago format into date format
            if '30+' in posted_date:
                pass
            else:
                if (posted_date == 'Today'):
                    posted_date = datetime.date.today()
                elif (posted_date == 'Just posted'):
                    posted_date = datetime.date.today()
                else:
                    if (posted_date == '1 day ago'):
                        dt = datetime.timedelta(days=1)
                    else:
                        parsed_date = [posted_date.split()[:2]]
                        if (parsed_date[0][1] == 'hour'):
                            parsed_date[0][1] = 'hours'
                        time_dict = dict((index, float(value))
                                         for value, index in parsed_date)
                        dt = datetime.timedelta(**time_dict)
                    posted_date = datetime.datetime.now() - dt
                    posted_date = posted_date.date()

            # To extract skills from description
            skills = []
            all_text = description + job_name
            desc = modify(all_text)
            for skill in ld_skills:
                modified = modify(skill).lower()
                if modified in desc.lower():
                    skills.append(skill)

            ld_link = get_link(skills, driver)
            writer.writerow([
                posted_date, skills, job_name, job_type, company_name,
                description, ld_link
            ])
            print('New job record added: ', job_name)

        print('\nSuccessfully created a new csv file for indeed.com jobs - ' +
              filename + '.')
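
These examples call a few project helpers that are not shown on this page (`get_all_skills`, `modify`, `get_link`) and assume a common set of imports. A minimal sketch of what those pieces might look like follows; the imports are inferred from the calls in the snippets, and the stub bodies are placeholders, not the project's actual implementations.

import csv
import datetime
import os
import re
from time import strftime

from bs4 import BeautifulSoup
from dateutil.parser import parse  # used as parse() in the weworkremotely example


def get_all_skills(driver):
    # Placeholder: presumably scrapes the full list of LD skills that each
    # job description is matched against.
    return []


def modify(text):
    # Placeholder: some normalisation of text before substring matching,
    # e.g. collapsing whitespace so multi-word skills still match.
    return ' '.join(text.split())


def get_link(skills, driver):
    # Placeholder: presumably builds an LD link for the matched skills.
    return ''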
Example #2
def scraper(job_links, driver):
    # Get all LD skills to compare and extract skills from the job description
    ld_skills = get_all_skills(driver)

    today = datetime.datetime.now()
    filename = 'weworkremotely_jobs_' + str(
        today.day) + '_' + strftime("%b") + '_' + str(today.year) + '_' + str(
            today.hour) + '_' + str(today.minute) + '.csv'

    with open(os.path.dirname(os.path.abspath('weworkremotely_jobs')) +
              '/weworkremotely_jobs/' + filename,
              'w',
              newline='') as csvfile:  # gobi version
        #with open(os.path.dirname(os.path.abspath('weworkremotely_jobs')) + '/weworkremotely_jobs/' + filename, 'w', encoding='UTF-16', newline='') as csvfile: # anisha version
        writer = csv.writer(csvfile)
        writer.writerow([
            'posted_date', 'skills', 'job_name', 'job_type', 'company_name',
            'company_location', 'website', 'description', 'ld_link'
        ])

        for link in job_links:
            driver.get(link)
            detail = driver.find_element_by_class_name('content')
            result = detail.get_attribute('innerHTML')
            soup = BeautifulSoup(result, 'html.parser')

            # Initialization
            job_type = []
            skills = []
            company_location = website = 'Not Mentioned'

            # Extracting the required data
            job_name = soup.find("h1").text.strip()
            header = soup.find("div", class_='listing-header-container').text
            posted_date = soup.find("div",
                                    class_='listing-header-container').find(
                                        "time").attrs.get('datetime')
            posted_date = parse(posted_date).date()
            basics = soup.find("div",
                               class_='listing-header-container').find_all("a")
            description = soup.find("div", class_='listing-container').text
            company_card = soup.find("div", class_='company-card')

            # Extracting company's info
            company_name = company_card.find("h2").text
            company_info = company_card.find_all("h3")
            for data in company_info:
                if data.find("a"):
                    website = data.find("a").attrs.get('href')
                else:
                    company_location = data.text.strip()

            # Arrange job type details as comma-separated values
            for date in basics:
                job_type.append(date.text)
            job_type = ', '.join(job_type)

            # To extract skills from description
            all_text = description + header
            desc = modify(all_text)
            for skill in ld_skills:
                modified = modify(skill).lower()
                if modified in desc.lower():
                    skills.append(skill)

            ld_link = get_link(skills, driver)
            writer.writerow([
                posted_date, skills, job_name, job_type, company_name,
                company_location, website, description, ld_link
            ])
            print('New job record added: ', job_name)

        print(
            '\nSuccessfully created a new csv file for weworkremotely.com jobs - '
            + filename + '.')
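
The weworkremotely example reads the posting date straight from the `<time datetime="...">` attribute and hands it to `parse`, which is presumably `dateutil.parser.parse`: it accepts the ISO 8601 string as-is, and `.date()` trims it to a plain date. A one-line illustration (the timestamp is made up):

from dateutil.parser import parse

print(parse('2021-05-04T00:00:00Z').date())  # 2021-05-04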
Example #3
def scraper(filename, choice, job_links, driver):

    if (choice == 1):
        mode = 'w'
    else:
        mode = 'a'

    # For writing or updating/appending csv
    # with open(os.path.dirname(os.path.abspath('stackoverflow_jobs')) + '/stackoverflow_jobs/' + filename, mode , encoding='UTF-32', newline='') as csvfile: # anisha version
    with open(os.path.dirname(os.path.abspath('stackoverflow_jobs')) +
              '/stackoverflow_jobs/' + filename,
              mode,
              newline='') as csvfile:  # gobi version
        writer = csv.writer(csvfile)
        if mode == 'w':
            writer.writerow([
                'posted_date', 'technologies', 'job_name', 'company',
                'job_type', 'experience_level', 'role', 'industry',
                'company_size', 'company_type', 'ld_link', 'description'
            ])

        num_records_got = 0  # tmp for debug
        # Scraping
        for link in job_links:
            driver.get('https://stackoverflow.com' + link)
            detail = driver.find_element_by_id('mainbar')
            result = detail.get_attribute('innerHTML')
            soup = BeautifulSoup(result, 'html.parser')
            content = soup.find(class_='nav-content')

            # Initialization
            skills = []
            technologies = []
            about = []
            job_type = experience = role = industry = company_size = company_type = 'Not mentioned'

            # Extracting the required data
            tags = content.find_all("a", class_='post-tag')
            for tag in tags:
                data = tag.text
                technologies.append(data)

            job_name = soup.find("a", class_='fc-black-900').text
            basics = content.find(class_='job-details--about').find_all(
                class_='mb8')
            company = soup.find("div", class_='fc-black-700').a.text
            description = content.find(
                class_='mb32 fs-body2 fc-medium pr48').find("div").text
            actual_date = content.find(
                class_='grid fs-body1 fc-black-500 gs8 ai-baseline mb24').text
            actual_date = actual_date.strip()

            # To change n_days_ago format into date format
            check_date = actual_date.split()
            if '<' in check_date:
                actual_date = actual_date.replace('Posted < ', '')
            else:
                actual_date = actual_date.replace('Posted ', '')
            if (actual_date == 'yesterday'):
                dt = datetime.timedelta(days=1)
            else:
                parsed_date = [actual_date.split()[:2]]
                if (parsed_date[0][1] == 'hour'):
                    parsed_date[0][1] = 'hours'
                time_dict = dict(
                    (index, float(value)) for value, index in parsed_date)
                dt = datetime.timedelta(**time_dict)
            posted_date = datetime.datetime.now() - dt
            posted_date = posted_date.date()

            # To assign values to respective variables from basics list
            for basic in basics:
                data = basic.text
                data = data.replace('\n', '')
                data = data.split(': ')
                final = [data[0], data[-1]]
                about.append(final)

            for info in about:
                title = info[0]
                desc = info[-1]
                if (title == 'Job type'):
                    job_type = desc
                elif (title == 'Experience level'):
                    experience = desc
                elif (title == 'Role'):
                    role = desc
                elif (title == 'Industry'):
                    industry = desc
                elif (title == 'Company size'):
                    company_size = desc
                else:
                    company_type = desc

            for skill in technologies:
                if '-' in skill:
                    skill = skill.replace('-', ' ')
                skills.append(skill)

            ld_link = get_link(skills, driver)
            writer.writerow([
                posted_date, technologies, job_name, company, job_type,
                experience, role, industry, company_size, company_type,
                ld_link, description
            ])
            print('New job record added: ', job_name)

            num_records_got += 1
            if num_records_got > 4:
                break  #tmp for debugging

        # check_duplicate(filename) # no longer need to check for duplicates since we only need to create a new csv each time now (I commented this out since I got an error - not a priority to solve right now since we will only be using option 1 to create a new csv)

        if (mode == 'w'):
            print(
                '\nSuccessfully created new csv file for stackoverflow.com jobs - '
                + filename + '.')
        else:
            print('\nSuccessfully updated', filename + '.')
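
Both the indeed and stackoverflow examples turn a relative "N days/hours ago" string into an actual date with the same trick: take the number and the unit, pluralise 'hour' so it matches a `datetime.timedelta` keyword, build a one-entry dict, and subtract the resulting timedelta from now. A standalone walk-through with an illustrative input:

import datetime

actual_date = 'Posted 3 days ago'.replace('Posted ', '')  # '3 days ago' (illustrative input)
parsed_date = [actual_date.split()[:2]]                   # [['3', 'days']]
if parsed_date[0][1] == 'hour':                           # timedelta() only accepts plural keywords
    parsed_date[0][1] = 'hours'
time_dict = dict((index, float(value)) for value, index in parsed_date)  # {'days': 3.0}
dt = datetime.timedelta(**time_dict)
posted_date = (datetime.datetime.now() - dt).date()
print(posted_date)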
Example #4
def scraper(jobs, driver):

    # Get all LD skills to compare and extract skills from the job description
    ld_skills = get_all_skills(driver)

    today = datetime.datetime.now()
    filename = 'remoteok_jobs_' + str(
        today.day) + '_' + strftime("%b") + '_' + str(today.year) + '_' + str(
            today.hour) + '_' + str(today.minute) + '.csv'

    # with open(os.path.dirname(os.path.abspath('remoteok_jobs')) + '/remoteok_jobs/' + filename, 'w', encoding='UTF-16', newline='') as file: # anisha version
    with open(os.path.dirname(os.path.abspath('remoteok_jobs')) +
              '/remoteok_jobs/' + filename,
              'w',
              newline='') as csvfile:  # gobi version
        writer = csv.writer(csvfile)
        writer.writerow([
            'posted_date', 'skills', 'job_name', 'company_name', 'description',
            'ld_link'
        ])

        for link in jobs:
            driver.get('https://remoteok.io' + link)
            all_rows = driver.find_elements_by_tag_name("tr")
            for row in all_rows:
                if "job-" in row.get_attribute('id'):
                    container = row
                    break
            job = BeautifulSoup(container.get_attribute("innerHTML"),
                                'html.parser')

            # Extracting the required data
            job_detail = job.find(
                class_="company position company_and_position")
            company_name = job_detail.find("h3").text
            job_name = job_detail.find("h2").text
            tags = job.find(class_="tags").find_all(class_=re.compile("^tag"))
            description = driver.find_element_by_class_name('description').text
            posted_date = job.find("time")['datetime']
            posted_date = datetime.datetime.strptime(
                ''.join(posted_date.rsplit(':', 1)), '%Y-%m-%dT%H:%M:%S%z')

            # Extract skills from tags
            skills = []
            for tag in tags:
                tagname = str(tag.find('h3').text).title()
                skills.append(tagname)

            # Extract skills from description and append in skills list
            desc = modify(description)
            for skill in ld_skills:
                modified = modify(skill).lower()
                if modified in desc.lower():
                    skills.append(skill)

            ld_link = get_link(skills, driver)
            writer.writerow([
                posted_date, skills, job_name, company_name, description,
                ld_link
            ])
            print('New job record added: ', job_name)

        print('\nSuccessfully created a new csv file for remoteok.io jobs - ' +
              filename + '.')
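
The `''.join(posted_date.rsplit(':', 1))` line in the remoteok example above removes the last colon from the timezone offset (e.g. '+00:00' becomes '+0000') so that `strptime`'s `%z` directive can parse it on Python versions older than 3.7. A small illustration with a made-up timestamp:

import datetime

stamp = '2021-05-04T00:00:00+00:00'       # hypothetical value of the <time datetime=""> attribute
no_colon = ''.join(stamp.rsplit(':', 1))  # '2021-05-04T00:00:00+0000'
posted = datetime.datetime.strptime(no_colon, '%Y-%m-%dT%H:%M:%S%z')
print(posted)                             # 2021-05-04 00:00:00+00:00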
Example #5

def scraper(job_links, driver):

  today = datetime.datetime.now()
  filename = 'angel_co_jobs_' + str(today.day) + '_' + strftime("%b") + '_' + str(today.year) + '_' + str(today.hour) + '_' + str(today.minute) + '.csv'

  #with open(os.path.dirname(os.path.abspath('angel_co_jobs')) + '/angel_co_jobs/' + filename, 'w', encoding='UTF-16', newline='') as csvfile: # anisha version
  with open(os.path.dirname(os.path.abspath('angel_co_jobs')) + '/angel_co_jobs/' + filename, 'w', newline='') as csvfile: # gobi version
    writer = csv.writer(csvfile)
    writer.writerow(['skills', 'job_name', 'job_type', 'experience', 'company_name', 'industry', 'company_size', 'location', 'hiring_contact', 'website', 'description', 'ld_link'])

    #tmp_limit = 0
    for link  in job_links:
      try:
        driver.get(link)
        detail  = driver.find_element_by_class_name('wrapper_06a53')
        result  = detail.get_attribute('innerHTML')
        soup    = BeautifulSoup(result, 'html.parser')
        content = soup.find(class_ = 'content_50e69')

        # Initialization
        about_job = []
        skills    = []
        industry  = []
        job_type  = experience = company_size = location = hiring_contact = website = 'Not mentioned'
        
        # Extracting the required data
        job_name      = content.find("h2", class_ = 'header_ec0af').text
        basics        = content.find("div", class_ = 'component_4105f').find_all("div",class_ = 'characteristic_650ae')
        company_name  = content.find("div", class_ = 'name_af83c').find("h1").text
        about_company = soup.find("div", class_ = 'component_3298f').find_all("dt")
        description   = content.find("div", class_ = 'description_c90c4').text

        # Extract company size
        try:
          for about in about_company:
            extracts = about.text
            if(" people" in extracts):
              company_size = extracts
        except:
          pass
        
        # Extract website link
        try:
          website = soup.find("li", class_ = 'websiteLink_b71b4').find("a").get('href')
        except:
          pass

        # Extract hiring contact
        try:
          hiring         = content.find("div", class_ = 'recruitingContact_82245').find("h4", class_ = 'name_9d036').text
          hiring_post    = content.find("div", class_ = 'recruitingContact_82245').find("span").text
          hiring_contact = hiring + ', ' + hiring_post
        except:
          pass
      
        # Extract industry details
        try:
          industry_info = soup.find("div", class_ = 'component_3298f').find("dt", class_ = 'tags_70e20').find_all("a")
          for info in industry_info:
            industry.append(info.text)
        except:
          pass

        for basic in basics:
          title = basic.find("dt").text
          # Extract skills
          if (title == 'Skills'):
            all_skill = basic.find_all("a")
            for skill in all_skill:
              one_skill = skill.text
              skills.append(one_skill)
          else:
            desc = basic.find("dd").text
            pair = [title, desc]
            about_job.append(pair)
        
        # Extract basic details about job
        for info in about_job:
          title = info[0]
          desc = info[-1]
          if (title == 'Location'):
            location = desc
          elif (title == 'Job type'):
            job_type = desc
          elif (title == 'Experience'):
            experience = desc
          else:
            continue

        ld_link = get_link(skills, driver)
        writer.writerow([skills, job_name, job_type, experience, company_name, industry, company_size, location, hiring_contact, website, description, ld_link])
        print('New job record added: ', job_name)
      except:
        print('Skipping one link since it did not work.')
      #tmp_limit += 1
      #if tmp_limit > 10:
      #  break  

    print('\nSuccessfully created a new csv file for angel.co jobs - ' + filename + '.')
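
None of the examples show how the scrapers are driven. As a rough sketch only: each one expects an already-created Selenium driver and a pre-collected list of job links (the stackoverflow version also takes a filename and a write/append choice). The driver setup and the link below are assumptions for illustration, not part of the original code.

from selenium import webdriver

driver = webdriver.Chrome()  # assumes chromedriver is available on PATH
job_links = ['https://www.indeed.com/viewjob?jk=example']  # normally collected by a listing scraper
try:
    scraper(job_links, driver)  # the indeed variant; the others follow the same shape
finally:
    driver.quit()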