Example #1
 def set(self, parameter: JobField, job: Job, soup: BeautifulSoup) -> None:
     """Set a single job attribute from a soup object by JobField
     NOTE: priority is: HIGH: RAW, LOW: DESCRIPTION / TAGS
     """
     if parameter == JobField.RAW:
         job._raw_scrape_data = BeautifulSoup(
             self.session.get(job.url).text, self.config.bs4_parser)
     elif parameter == JobField.WAGE:
         pot_wage_cell = job._raw_scrape_data.find(
             'div', attrs={'class': 'col-xs-12 cell'})
         if pot_wage_cell:
             pot_wage_value = pot_wage_cell.find('div')
             if pot_wage_value:
                 job.wage = pot_wage_value.text.strip()
     elif parameter == JobField.DESCRIPTION:
         assert job._raw_scrape_data
         job.description = job._raw_scrape_data.find(
             id='JobDescription').text.strip()
     elif parameter == JobField.TAGS:
         # NOTE: this seems a bit flimsy, monster allows a lot of flex. here
         assert job._raw_scrape_data
         tags = []  # type: List[str]
         for li in job._raw_scrape_data.find_all(
                 'section', attrs={'class': 'summary-section'}):
             table_key = li.find('dt')
             if (table_key and table_key.text.strip().lower()
                     in MONSTER_SIDEPANEL_TAG_ENTRIES):
                 table_value = li.find('dd')
                 if table_value:
                     tags.append(table_value.text.strip())
     else:
         raise NotImplementedError(f"Cannot set {parameter.name}")
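For reference, the TAGS branch above expects side-panel markup shaped roughly like the snippet below, where the <dt> text must match an entry in MONSTER_SIDEPANEL_TAG_ENTRIES. The HTML here is an illustrative guess based on the selectors the code uses, not a captured Monster page:

from bs4 import BeautifulSoup

html = """
<section class="summary-section"><dt>Job Type</dt><dd>Full-time</dd></section>
<section class="summary-section"><dt>Industries</dt><dd>Software</dd></section>
"""
soup = BeautifulSoup(html, 'html.parser')
for section in soup.find_all('section', attrs={'class': 'summary-section'}):
    # Mirrors the dt/dd walk in the TAGS branch above.
    print(section.find('dt').text, '->', section.find('dd').text)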
Example #2
 def set(self, parameter: JobField, job: Job, soup: BeautifulSoup) -> None:
     """Set a single job attribute from a soup object by JobField
     NOTE: DESCRIPTION requires a GET request, so it should be respectfully delayed
     """
     if parameter == JobField.RAW:
         job._raw_scrape_data = BeautifulSoup(
             self.session.get(job.url).text, self.config.bs4_parser)
     elif parameter == JobField.DESCRIPTION:
         assert job._raw_scrape_data
         job.description = job._raw_scrape_data.find(
             id='JobDescriptionContainer').text.strip()
     else:
         raise NotImplementedError(f"Cannot set {parameter.name}")
Example #3
    def filterable(self,
                   job: Job,
                   check_existing_duplicates: bool = True) -> bool:
        """Filter jobs out using all our available filters

        NOTE: this allows job to be partially initialized
        NOTE: if a job has UNKNOWN remoteness, we will include it anyways
        TODO: we should probably add some logging to this?

        Arguments:
            check_existing_duplicates: pass True to check if ID was previously
                detected to be a duplicate via TFIDF cosine similarity

        Returns:
            True if the job should be removed from incoming data, else False
        """
        return bool((job.status and job.is_remove_status)
                    or (job.company in self.blocked_company_names_list)
                    or (job.post_date and self.max_job_date
                        and job.is_old(self.max_job_date))
                    or (job.key_id and self.user_block_jobs_dict
                        and job.key_id in self.user_block_jobs_dict)
                    or (check_existing_duplicates and self.is_duplicate(job))
                    or (job.remoteness != Remoteness.UNKNOWN
                        and self.desired_remoteness != Remoteness.ANY
                        and job.remoteness != self.desired_remoteness))
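A hedged usage sketch: assuming a JobFilter-style instance named job_filter and an incoming jobs_dict keyed by key_id (both names are placeholders), a job is kept only when filterable() returns False:

filtered_jobs = {
    key_id: job
    for key_id, job in jobs_dict.items()
    if not job_filter.filterable(job)  # True means "remove from incoming data"
}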
Example #4
 def set(self, parameter: JobField, job: Job, soup: BeautifulSoup) -> None:
     """Set a single job attribute from a soup object by JobField
     NOTE: URL is high-priority, since we need it to get RAW.
     """
     if parameter == JobField.RAW:
         job._raw_scrape_data = BeautifulSoup(
             self.session.get(job.url).text, self.config.bs4_parser)
     elif parameter == JobField.DESCRIPTION:
         assert job._raw_scrape_data
         job.description = job._raw_scrape_data.find(
             id='jobDescriptionText').text.strip()
     elif parameter == JobField.URL:
         assert job.key_id
         job.url = (f"http://www.indeed.{self.config.search_config.domain}/"
                    f"viewjob?jk={job.key_id}")
     else:
         raise NotImplementedError(f"Cannot set {parameter.name}")
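The URL note above implies a dependency chain for Indeed: URL needs only key_id, RAW performs the GET against that URL, and DESCRIPTION parses RAW. A minimal ordering sketch, where scraper, job, and soup are placeholder objects and the JobField import path is assumed:

from jobfunnel.resources import JobField  # assumed import path

for field in (JobField.URL, JobField.RAW, JobField.DESCRIPTION):
    scraper.set(field, job, soup)  # URL first, since RAW GETs job.url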
Example #5
    def read_master_csv(self) -> Dict[str, Job]:
        """Read in the master-list CSV to a dict of unique Jobs

        TODO: make blurb --> description and add short_description

        Returns:
            Dict[str, Job]: unique Job objects in the CSV
        """
        jobs_dict = {}  # type: Dict[str, Job]
        with open(self.config.master_csv_file, 'r', encoding='utf8',
                  errors='ignore') as csvfile:
            for row in csv.DictReader(csvfile):

                # NOTE: we are doing legacy support here with 'blurb' etc.
                # In the future we should have an actual short description
                if 'short_description' in row:
                    short_description = row['short_description']
                else:
                    short_description = ''
                post_date = datetime.strptime(row['date'], '%Y-%m-%d')

                if 'scrape_date' in row:
                    scrape_date = datetime.strptime(
                        row['scrape_date'], '%Y-%m-%d'
                    )
                else:
                    scrape_date = post_date

                if 'raw' in row:
                    # NOTE: we should never see this because raw can't be in CSV
                    raw = row['raw']
                else:
                    raw = None

                # FIXME: this is the wrong way to compare row val to Enum.name!
                # We need to convert from user statuses
                status = None
                status_str = ''  # default so the warning below can't hit an unbound name
                if 'status' in row:
                    status_str = row['status'].strip()
                    for p_status in JobStatus:
                        if status_str.lower() == p_status.name.lower():
                            status = p_status
                            break
                if not status:
                    self.logger.warning(
                        "Unknown status %s, setting to UNKNOWN", status_str
                    )
                    status = JobStatus.UNKNOWN

                # NOTE: this is for legacy support:
                locale = None
                locale_str = ''  # default so the warning below can't hit an unbound name
                if 'locale' in row:
                    locale_str = row['locale'].strip()
                    for p_locale in Locale:
                        if locale_str.lower() == p_locale.name.lower():
                            locale = p_locale
                            break
                if not locale:
                    self.logger.warning(
                        "Unknown locale %s, setting to UNKNOWN", locale_str
                    )
                    locale = Locale.UNKNOWN

                # Check for remoteness (handle if not present for legacy)
                remoteness = Remoteness.UNKNOWN
                if 'remoteness' in row:
                    remote_str = row['remoteness'].strip()
                    remoteness = Remoteness[remote_str]

                # Check for wage (handle if not present for legacy)
                wage = ''
                if 'wage' in row:
                    wage = row['wage'].strip()
                    
                job = Job(
                    title=row['title'],
                    company=row['company'],
                    location=row['location'],
                    description=row['blurb'],
                    key_id=row['id'],
                    url=row['link'],
                    locale=locale,
                    query=row['query'],
                    status=status,
                    provider=row['provider'],
                    short_description=short_description,
                    post_date=post_date,
                    scrape_date=scrape_date,
                    raw=raw,
                    tags=row['tags'].split(','),
                    remoteness=remoteness,
                )
                job.validate()
                jobs_dict[job.key_id] = job

        self.logger.debug(
            "Read %d jobs from master-CSV: %s",
            len(jobs_dict.keys()), self.config.master_csv_file
        )
        return jobs_dict
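A hedged usage sketch, assuming master_csv_handler is an instance of the class that owns read_master_csv (the variable name is a placeholder):

existing_jobs = master_csv_handler.read_master_csv()  # Dict[str, Job] keyed by key_id
for key_id, job in existing_jobs.items():
    print(f"{key_id}: {job.title} @ {job.company} [{job.status.name}]")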
Example #6
File: base.py  Project: ncolyer/JobFunnel
    def scrape_job(self, job_soup: BeautifulSoup, delay: float,
                   delay_lock: Optional[Lock] = None) -> Optional[Job]:
        """Scrapes a search page and get a list of soups that will yield jobs
        Arguments:
            job_soup (BeautifulSoup): This is a soup object that your get/set
                will use to perform the get/set action. It should be specific
                to this job and not contain other job information.
            delay (float): how long to delay getting/setting for certain
                get/set calls while scraping data for this job.
            delay_lock (Optional[Manager.Lock], optional): semaphore for
                synchronizing respectful delaying across workers

        NOTE: this will never raise an exception to prevent killing workers,
            who are building jobs sequentially.

        Returns:
            Optional[Job]: job object constructed from the soup and localization
                of class, returns None if scrape failed.
        """
        # Scrape the data for the post, requiring a minimum of info...
        # NOTE: if we perform a self.session.get we may get respectfully delayed
        job = None  # type: Optional[Job]
        job_init_kwargs = self.job_init_kwargs  # NOTE: faster?
        for is_get, field in self._actions_list:

            # Break out immediately because we have failed a filterable
            # condition with something we initialized while scraping.
            if job and self.job_filter.filterable(job):
                if self.job_filter.is_duplicate(job):
                    # NOTE: if we pre-empt scraping duplicates we cannot update
                    # the existing job listing with the new information!
                    # TODO: make this behaviour configurable? ('minimal-get' ?)
                    self.logger.debug(
                        "Scraped job %s has key_id in known duplicates list. "
                        "Continuing scrape of job to update existing job "
                        "attributes.",
                        job.key_id
                    )
                else:
                    self.logger.debug(
                        "Cancelled scraping of %s, failed JobFilter",
                        job.key_id
                    )
                    break

            # Respectfully delay if it's configured to do so.
            if field in self.delayed_get_set_fields:
                if delay_lock:
                    self.logger.debug("Delaying for %.4f", delay)
                    with delay_lock:
                        sleep(delay)
                else:
                    sleep(delay)

            try:
                if is_get:
                    job_init_kwargs[field] = self.get(field, job_soup)
                else:
                    if not job:
                        # Build initial job object + populate all the job
                        job = Job(**{
                            k.name.lower(): v for k, v
                            in job_init_kwargs.items()
                        })
                    self.set(field, job, job_soup)

            except Exception as err:

                # TODO: we should really dump the soup object to an XML file
                # so that users encountering bugs can submit it and we can
                # quickly fix any failing scraping.

                if field in self.min_required_job_fields:
                    raise ValueError(
                        "Unable to scrape minimum-required job field: "
                        f"{field.name}. Got error: {err}. "
                        f"{job.url if job else ''}"
                    )
                else:
                    # Crash out gracefully so we can continue scraping.
                    self.logger.warning(
                        "Unable to scrape %s for job: %s. %s",
                        field.name.lower(),
                        err,
                        job.url if job else '',
                    )

        # Validate job fields if we got something
        if job:
            try:
                job.validate()
            except Exception as err:
                # Bad job scrapes can't take down execution!
                # NOTE: desc too short etc, usually indicates that the job
                # is an empty page. Not sure why this comes up once in awhile...
                self.logger.error("Job failed validation: %s", err)
                return None

        return job
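A hedged driver sketch showing how scrape_job might be fanned out across job soups with a shared delay lock. Here scraper, job_soups, and the fixed 1.0 s delay are placeholders; JobFunnel itself computes per-job delays:

from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Manager

delay_lock = Manager().Lock()  # shared lock for respectful delaying across workers
jobs = {}
with ThreadPoolExecutor(max_workers=4) as pool:
    futures = [
        pool.submit(scraper.scrape_job, soup, 1.0, delay_lock)
        for soup in job_soups
    ]
    for future in futures:
        job = future.result()
        if job:  # scrape_job returns None when a scrape fails
            jobs[job.key_id] = job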