def set(self, parameter: JobField, job: Job, soup: BeautifulSoup) -> None:
    """Set a single job attribute from a soup object by JobField
    NOTE: priority is: HIGH: RAW, LOW: DESCRIPTION / TAGS
    """
    if parameter == JobField.RAW:
        job._raw_scrape_data = BeautifulSoup(
            self.session.get(job.url).text, self.config.bs4_parser
        )
    elif parameter == JobField.WAGE:
        pot_wage_cell = job._raw_scrape_data.find(
            'div', attrs={'class': 'col-xs-12 cell'}
        )
        if pot_wage_cell:
            pot_wage_value = pot_wage_cell.find('div')
            if pot_wage_value:
                job.wage = pot_wage_value.text.strip()
    elif parameter == JobField.DESCRIPTION:
        assert job._raw_scrape_data
        job.description = job._raw_scrape_data.find(
            id='JobDescription'
        ).text.strip()
    elif parameter == JobField.TAGS:
        # NOTE: this seems a bit flimsy, monster allows a lot of flex. here
        assert job._raw_scrape_data
        tags = []  # type: List[str]
        for li in job._raw_scrape_data.find_all(
                'section', attrs={'class': 'summary-section'}):
            table_key = li.find('dt')
            if (table_key and table_key.text.strip().lower()
                    in MONSTER_SIDEPANEL_TAG_ENTRIES):
                table_value = li.find('dd')
                if table_value:
                    tags.append(table_value.text.strip())
        # Store the collected sidepanel entries on the job.
        job.tags = tags
    else:
        raise NotImplementedError(f"Cannot set {parameter.name}")
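
# Illustrative sketch (not part of the scraper): per the NOTE above, RAW is
# high-priority because the other branches read job._raw_scrape_data. Assuming
# `scraper` is an instance of this Monster scraper and `job`/`soup` come from a
# search-results page (names here are assumptions), the call order would be:
def _set_monster_fields_sketch(scraper, job: Job, soup: BeautifulSoup) -> None:
    scraper.set(JobField.RAW, job, soup)          # performs the GET of job.url
    scraper.set(JobField.WAGE, job, soup)         # reads the cached raw soup
    scraper.set(JobField.DESCRIPTION, job, soup)
    scraper.set(JobField.TAGS, job, soup)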
def set(self, parameter: JobField, job: Job, soup: BeautifulSoup) -> None:
    """Set a single job attribute from a soup object by JobField
    NOTE: DESCRIPTION requires a GET request, so it should be respectfully
        delayed.
    """
    if parameter == JobField.RAW:
        job._raw_scrape_data = BeautifulSoup(
            self.session.get(job.url).text, self.config.bs4_parser
        )
    elif parameter == JobField.DESCRIPTION:
        assert job._raw_scrape_data
        job.description = job._raw_scrape_data.find(
            id='JobDescriptionContainer'
        ).text.strip()
    else:
        raise NotImplementedError(f"Cannot set {parameter.name}")
def filterable(self, job: Job, check_existing_duplicates: bool = True) -> bool:
    """Filter jobs out using all our available filters
    NOTE: this allows job to be partially initialized
    NOTE: if a job has UNKNOWN remoteness, we will include it anyways
    TODO: we should probably add some logging to this?

    Arguments:
        check_existing_duplicates: pass True to check if ID was previously
            detected to be a duplicate via TFIDF cosine similarity

    Returns:
        True if the job should be removed from incoming data, else False
    """
    return bool(
        job.status and job.is_remove_status
        or (job.company in self.blocked_company_names_list)
        or (job.post_date and self.max_job_date
            and job.is_old(self.max_job_date))
        or (job.key_id and self.user_block_jobs_dict
            and job.key_id in self.user_block_jobs_dict)
        or (check_existing_duplicates and self.is_duplicate(job))
        or (job.remoteness != Remoteness.UNKNOWN
            and self.desired_remoteness != Remoteness.ANY
            and job.remoteness != self.desired_remoteness)
    )
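
# Usage sketch (illustrative, not part of the library): filterable() returns
# True for jobs that should be dropped, so a caller keeps the jobs for which
# it returns False. Assumes `job_filter` is an instance of this filter class
# and `jobs` is a Dict[str, Job] keyed by key_id; both names are assumptions.
def _filter_incoming_jobs_sketch(job_filter, jobs: Dict[str, Job]) -> Dict[str, Job]:
    # Keep only jobs that do not trip any of the configured filters.
    return {
        key_id: job for key_id, job in jobs.items()
        if not job_filter.filterable(job)
    }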
def set(self, parameter: JobField, job: Job, soup: BeautifulSoup) -> None:
    """Set a single job attribute from a soup object by JobField
    NOTE: URL is high-priority, since we need it to get RAW.
    """
    if parameter == JobField.RAW:
        job._raw_scrape_data = BeautifulSoup(
            self.session.get(job.url).text, self.config.bs4_parser
        )
    elif parameter == JobField.DESCRIPTION:
        assert job._raw_scrape_data
        job.description = job._raw_scrape_data.find(
            id='jobDescriptionText'
        ).text.strip()
    elif parameter == JobField.URL:
        assert job.key_id
        job.url = (
            f"http://www.indeed.{self.config.search_config.domain}/"
            f"viewjob?jk={job.key_id}"
        )
    else:
        raise NotImplementedError(f"Cannot set {parameter.name}")
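
# Illustrative sketch: how the URL branch above composes a job URL from a
# key_id and the configured search domain. The example values are made up;
# only the URL format comes from the code above.
def _indeed_url_sketch(key_id: str, domain: str) -> str:
    # e.g. key_id='0123456789abcdef', domain='ca'
    #   -> 'http://www.indeed.ca/viewjob?jk=0123456789abcdef'
    return f"http://www.indeed.{domain}/viewjob?jk={key_id}"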
def read_master_csv(self) -> Dict[str, Job]:
    """Read in the master-list CSV to a dict of unique Jobs
    TODO: make blurb --> description and add short_description

    Returns:
        Dict[str, Job]: unique Job objects in the CSV
    """
    jobs_dict = {}  # type: Dict[str, Job]
    with open(self.config.master_csv_file, 'r', encoding='utf8',
              errors='ignore') as csvfile:
        for row in csv.DictReader(csvfile):

            # NOTE: we are doing legacy support here with 'blurb' etc.
            # In the future we should have an actual short description
            if 'short_description' in row:
                short_description = row['short_description']
            else:
                short_description = ''
            post_date = datetime.strptime(row['date'], '%Y-%m-%d')

            if 'scrape_date' in row:
                scrape_date = datetime.strptime(
                    row['scrape_date'], '%Y-%m-%d'
                )
            else:
                scrape_date = post_date

            if 'raw' in row:
                # NOTE: we should never see this because raw cant be in CSV
                raw = row['raw']
            else:
                raw = None

            # FIXME: this is the wrong way to compare row val to Enum.name!
            # We need to convert from user statuses
            status = None
            if 'status' in row:
                status_str = row['status'].strip()
                for p_status in JobStatus:
                    if status_str.lower() == p_status.name.lower():
                        status = p_status
                        break
            if not status:
                self.logger.warning(
                    "Unknown status %s, setting to UNKNOWN", status_str
                )
                status = JobStatus.UNKNOWN

            # NOTE: this is for legacy support:
            locale = None
            if 'locale' in row:
                locale_str = row['locale'].strip()
                for p_locale in Locale:
                    if locale_str.lower() == p_locale.name.lower():
                        locale = p_locale
                        break
            if not locale:
                self.logger.warning(
                    "Unknown locale %s, setting to UNKNOWN", locale_str
                )
                locale = Locale.UNKNOWN

            # Check for remoteness (handle if not present for legacy)
            remoteness = Remoteness.UNKNOWN
            if 'remoteness' in row:
                remote_str = row['remoteness'].strip()
                remoteness = Remoteness[remote_str]

            # Check for wage (handle if not present for legacy)
            wage = ''
            if 'wage' in row:
                wage = row['wage'].strip()

            job = Job(
                title=row['title'],
                company=row['company'],
                location=row['location'],
                description=row['blurb'],
                key_id=row['id'],
                url=row['link'],
                locale=locale,
                query=row['query'],
                status=status,
                provider=row['provider'],
                short_description=short_description,
                post_date=post_date,
                scrape_date=scrape_date,
                wage=wage,
                raw=raw,
                tags=row['tags'].split(','),
                remoteness=remoteness,
            )
            job.validate()
            jobs_dict[job.key_id] = job

    self.logger.debug(
        "Read %d jobs from master-CSV: %s",
        len(jobs_dict.keys()), self.config.master_csv_file
    )
    return jobs_dict
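
# Usage sketch (illustrative): reading the master CSV back into Job objects
# and reporting a simple summary. Assumes `funnel` is an instance of the class
# that owns read_master_csv(); the name is an assumption, and JobStatus is
# assumed imported as in the code above.
def _summarize_master_csv_sketch(funnel) -> None:
    jobs = funnel.read_master_csv()  # Dict[str, Job] keyed by key_id
    n_unknown = sum(
        1 for job in jobs.values() if job.status == JobStatus.UNKNOWN
    )
    print(f"{len(jobs)} jobs loaded, {n_unknown} with unknown status.")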
def scrape_job(self, job_soup: BeautifulSoup, delay: float,
               delay_lock: Optional[Lock] = None) -> Optional[Job]:
    """Scrape a single job by getting/setting each of its JobFields from its soup

    Arguments:
        job_soup (BeautifulSoup): This is a soup object that your get/set
            will use to perform the get/set action. It should be specific
            to this job and not contain other job information.
        delay (float): how long to delay getting/setting for certain
            get/set calls while scraping data for this job.
        delay_lock (Optional[Manager.Lock], optional): semaphore for
            synchronizing respectful delaying across workers

    NOTE: this will never raise an exception to prevent killing workers,
        who are building jobs sequentially.

    Returns:
        Optional[Job]: job object constructed from the soup and localization
            of class, returns None if scrape failed.
    """
    # Scrape the data for the post, requiring a minimum of info...
    # NOTE: if we perform a self.session.get we may get respectfully delayed
    job = None  # type: Optional[Job]
    job_init_kwargs = self.job_init_kwargs  # NOTE: faster?
    for is_get, field in self._actions_list:

        # Break out immediately because we have failed a filterable
        # condition with something we initialized while scraping.
        if job and self.job_filter.filterable(job):
            if self.job_filter.is_duplicate(job):
                # NOTE: if we pre-empt scraping duplicates we cannot update
                # the existing job listing with the new information!
                # TODO: make this behaviour configurable? ('minimal-get' ?)
                self.logger.debug(
                    "Scraped job %s has key_id in known duplicates list. "
                    "Continuing scrape of job to update existing job "
                    "attributes.",
                    job.key_id
                )
            else:
                self.logger.debug(
                    "Cancelled scraping of %s, failed JobFilter",
                    job.key_id
                )
                break

        # Respectfully delay if it's configured to do so.
        if field in self.delayed_get_set_fields:
            if delay_lock:
                self.logger.debug("Delaying for %.4f", delay)
                with delay_lock:
                    sleep(delay)
            else:
                sleep(delay)

        try:
            if is_get:
                job_init_kwargs[field] = self.get(field, job_soup)
            else:
                if not job:
                    # Build initial job object + populate all the job
                    job = Job(**{
                        k.name.lower(): v
                        for k, v in job_init_kwargs.items()
                    })
                self.set(field, job, job_soup)

        except Exception as err:

            # TODO: we should really dump the soup object to an XML file
            # so that users encountering bugs can submit it and we can
            # quickly fix any failing scraping.

            if field in self.min_required_job_fields:
                raise ValueError(
                    "Unable to scrape minimum-required job field: "
                    f"{field.name} Got error: {str(err)}. "
                    f"{job.url if job else ''}"
                )
            else:
                # Crash out gracefully so we can continue scraping.
                self.logger.warning(
                    "Unable to scrape %s for job: %s. %s",
                    field.name.lower(),
                    err,
                    job.url if job else '',
                )

    # Validate job fields if we got something
    if job:
        try:
            job.validate()
        except Exception as err:
            # Bad job scrapes can't take down execution!
            # NOTE: desc too short etc, usually indicates that the job
            # is an empty page. Not sure why this comes up once in awhile...
            self.logger.error("Job failed validation: %s", err)
            return None

    return job
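
# Usage sketch (illustrative, not the library's actual driver): one plausible
# way to run scrape_job() across a worker pool with a shared lock so the
# respectful delays are synchronized, as the delay_lock argument suggests.
# `scraper`, `job_soups`, and `delays` are assumptions: one soup and one
# precomputed delay value per job, matching the signature above.
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Manager

def _scrape_all_sketch(scraper, job_soups, delays) -> Dict[str, Job]:
    jobs = {}  # type: Dict[str, Job]
    delay_lock = Manager().Lock()
    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = [
            executor.submit(scraper.scrape_job, soup, delay, delay_lock)
            for soup, delay in zip(job_soups, delays)
        ]
        for future in futures:
            job = future.result()  # scrape_job returns None on failure
            if job:
                jobs[job.key_id] = job
    return jobs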