def _get_info_from_guru_job_page_soup(self, posting_soup):
    posting = Posting({'source': self.source})
    # url, and unique id
    url = posting_soup.find('meta', attrs=GuruScraper._job_url_meta_attrs)
    if url is not None:
        posting['url'] = self._clean_post_url(url['content'])
        url_parts = urlsplit(posting['url'])
        if url_parts.path:
            # drop empty path segments; the last remaining one is the unique id
            sections = [s for s in url_parts.path.split('/') if s]
            if sections:
                posting['unique_id'] = sections[-1]
    # title
    try:
        title_header = posting_soup.find(attrs=GuruScraper._job_title_attrs)
        posting['title'] = PostingScraper._encode_unicode(title_header.text)
    except AttributeError:
        pass
    # date posted
    try:
        date_posted_span = posting_soup.find('span', attrs=GuruScraper._job_date_posted_span_attrs)
        posting['date_posted'] = safe_dt_parse(date_posted_span['data-date'])
    except (KeyError, AttributeError, ValueError):
        # traceback.print_exc(file=stderr)
        pass
    # duration
    try:
        duration_span = posting_soup.find('span', attrs=GuruScraper._job_duration_span_attrs)
        actual_date_span = duration_span.find('span')
        posting['duration'] = safe_dt_parse(actual_date_span['data-date'])
    except (KeyError, AttributeError, ValueError, TypeError):
        # traceback.print_exc(file=stderr)
        pass
    # budget
    try:
        budget_div = posting_soup.find('div', attrs=GuruScraper._job_budget_div_attrs)
        posting['budget'] = PostingScraper._encode_unicode(budget_div.text)
    except AttributeError:
        # traceback.print_exc(file=stderr)
        pass
    # skills
    try:
        skills_section = posting_soup.find(attrs=GuruScraper._job_skill_section_attrs)
        # findAll (not find) so that we map over every skill link in the section
        posting['skills'] = map(
            lambda x: PostingScraper._encode_unicode(PostingScraper._get_cleaned_soup_from_html(str(x)).text),
            skills_section.findAll('a', attrs=GuruScraper._job_skill_link_attrs))
    except AttributeError:
        # traceback.print_exc(file=stderr)
        pass
    # experience, description
    try:
        description_section = posting_soup.find(attrs=GuruScraper._job_experience_reqs_section_attrs)
        posting['description'] = PostingScraper._encode_unicode(
            PostingScraper._get_cleaned_soup_from_html(str(description_section)).text)
    except AttributeError:
        # traceback.print_exc(file=stderr)
        pass
    return posting
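# --- Illustrative sketch, not part of the scraper classes in this module ---
# The "last non-empty path segment" logic used by the Guru and Upwork scrapers could live in
# a small standalone helper like this one. The helper name and the sample URL are hypothetical;
# urlsplit comes from urlparse on Python 2 (urllib.parse on Python 3).
from urlparse import urlsplit


def _example_unique_id_from_url(url):
    """Return the last non-empty path segment of url, or None if the path has none."""
    sections = [s for s in urlsplit(url).path.split('/') if s]
    return sections[-1] if sections else None

# _example_unique_id_from_url("https://www.example.com/jobs/some-posting/123456/") -> "123456"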
def _get_info_from_indeed_result(self, row_result_soup):
    posting = Posting({"source": self.source})
    # url, title
    try:
        url_data = row_result_soup.find("a")
        posting["url"] = self._clean_post_url(url_data["href"])
        posting["title"] = PostingScraper._encode_unicode(url_data.text)
    except (AttributeError, KeyError, TypeError):
        pass
    # id
    try:
        posting["unique_id"] = row_result_soup["data-jk"]
    except KeyError:
        pass
    # location
    try:
        loc = row_result_soup.find("span", IndeedScraper._job_location_span_attrs).text
        if loc:
            # forward-geocode the location text; reverse geocoding expects coordinates
            g = geocoder.google(loc)
            posting["location"] = (g.city, g.state, g.country)
    except Exception:
        # traceback.print_exc(file=stderr)
        pass
    # date posted
    try:
        date_posted_span = row_result_soup.find("span", attrs=IndeedScraper._job_date_span_attrs)
        date_posted_text = PostingScraper._encode_unicode(date_posted_span.text).lower()
        if date_posted_text == "just posted":
            date_posted_text = "now"
        try:
            posting["date_posted"] = safe_dt_parse(date_posted_text)  # also throws AttributeError
        except (AttributeError, ValueError):
            # traceback.print_exc(file=stderr)
            pass
    except AttributeError:
        # traceback.print_exc(file=stderr)
        pass
    # description
    try:
        posting["description"] = PostingScraper._encode_unicode(
            row_result_soup.find("span", attrs=IndeedScraper._job_description_span_attrs).text
        )
    except AttributeError:
        # traceback.print_exc(file=stderr)
        pass
    if "description" not in posting:
        # fall back to the summary span
        try:
            posting["description"] = PostingScraper._encode_unicode(
                row_result_soup.find("span", attrs=IndeedScraper._job_summary_span_attrs).text
            )
        except AttributeError:
            # traceback.print_exc(file=stderr)
            pass
    return posting
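# --- Illustrative sketch, not part of the scraper classes in this module ---
# The location handling above forward-geocodes free text like "Boston, MA" and keeps the
# (city, state, country) triple. This standalone sketch shows the same call pattern with the
# third-party `geocoder` package; it needs network access, and the exact values returned
# depend on the geocoding service. The helper name and sample input are hypothetical.
import geocoder


def _example_location_triple(location_text):
    g = geocoder.google(location_text)
    return (g.city, g.state, g.country)

# _example_location_triple("Boston, MA") would be expected to yield something like
# ("Boston", "Massachusetts", "US"), depending on the service's response.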
def _get_info_from_clp_posting(self, url):
    posting = Posting({"source": self.source})
    soup = PostingScraper._get_cleaned_soup_from_url(url)
    posting["url"] = url
    # title
    try:
        posting["title"] = PostingScraper._encode_unicode(
            soup.find(attrs=CraigslistScraper._post_title_attrs).text)
    except AttributeError:
        # traceback.print_exc(file=stderr)
        pass
    # location: reverse-geocode the lat/lng embedded in the google maps link
    try:
        loc = soup.find(href=re.compile("google.com/maps"))["href"]
        g = geocoder.google(loc[loc.find("@") + 1:-1].split(",")[:-1], method="reverse")
        posting["state"] = g.state
        posting["city"] = g.city
        posting["country"] = g.country
    except Exception:
        pass
    # compensation
    try:
        posting["price"] = PostingScraper._encode_unicode(
            soup.find(text=re.compile(".*compensation.*")).parent.findChild("b").text
        ).replace("$", "")
    except AttributeError:
        pass
    # description
    try:
        posting["description"] = PostingScraper._encode_unicode(
            soup.find("section", {"id": "postingbody"}).text)
    except AttributeError:
        pass
    # date posted
    try:
        posting["date_posted"] = soup.find("time")["datetime"]
    except (AttributeError, KeyError, TypeError):
        pass
    # unique id: the numeric id at the end of the posting url
    try:
        posting["unique_id"] = "clgig" + re.match(r"[^\d]*(\d+)\.html", url).group(1)
    except AttributeError:
        pass
    return posting
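# --- Illustrative sketch, not part of the scraper classes in this module ---
# Standalone version of the unique-id extraction used for Craigslist postings above: take the
# first run of digits before ".html" and prefix it with "clgig". The helper name and the sample
# URL are hypothetical.
import re


def _example_clp_unique_id(url):
    m = re.match(r"[^\d]*(\d+)\.html", url)
    return ("clgig" + m.group(1)) if m else None

# _example_clp_unique_id("https://boston.craigslist.org/cpg/1234567890.html") -> "clgig1234567890"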
def _get_info_from_upwork_posting(self, posting_soup):
    """
    Given an Upwork article HTML object, extract the desired information and return it as a dict.
    :param posting_soup: the Soup-ed HTML
    :return: the data in a dict
    """
    posting = Posting({'source': self.source})
    # url, title, and unique id
    try:
        url = posting_soup.find('meta', attrs=UpworkScraper._job_url_attrs)
        if url is not None:
            posting['url'] = self._clean_post_url(url['content'])
            posting['title'] = PostingScraper._encode_unicode(url.text)
            url_parts = urlsplit(posting['url'])
            if url_parts.path:
                # drop empty path segments; the last remaining one is the unique id
                sections = [s for s in url_parts.path.split('/') if s]
                if sections:
                    posting['unique_id'] = sections[-1]
    except (KeyError, AttributeError):
        # traceback.print_exc(file=stderr)
        pass
    container = posting_soup.find(attrs=UpworkScraper._job_container_attrs)
    # date posted: it's in the 'popover' attribute
    try:
        date_posted_span = container.find(attrs=UpworkScraper._job_dateposted_attrs)
        posting['date_posted'] = safe_dt_parse(date_posted_span['popover'])
    except (KeyError, AttributeError, ValueError, TypeError):
        # traceback.print_exc(file=stderr)
        pass
    # price: second row of container, first element, first row inside that
    try:
        second_row = container.findAll('div', attrs=UpworkScraper._div_row_attrs)[1]
        first_child = second_row.find('div')
        price_row = first_child.find('div', attrs=UpworkScraper._div_row_attrs)
        posting['price_info'] = PostingScraper._encode_unicode(price_row.text)
    except (IndexError, AttributeError):
        # IndexError if the container has no second 'row' div,
        # AttributeError if any of the nested divs (or the price row) is missing
        # traceback.print_exc(file=stderr)
        pass
    # description
    try:
        description_air_card = container.find('div', attrs=UpworkScraper._job_descrip_aircard_attrs)
        posting['description'] = PostingScraper._encode_unicode(description_air_card.find('p').text)
    except AttributeError:  # handle if soup finds nothing
        # traceback.print_exc(file=stderr)
        pass
    # skills
    try:
        posting['skills'] = map(lambda x: PostingScraper._encode_unicode(x.text),
                                container.findAll('a', attrs=UpworkScraper._job_skill_tag_attrs))
    except AttributeError:  # handle if soup finds nothing for skills
        pass
    return posting
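# --- Illustrative sketch, not part of the scraper classes in this module ---
# The skills extraction above maps over every matching <a> tag returned by findAll. This
# self-contained example shows the same pattern on invented markup; the "skill-tag" class and
# the helper name are assumptions for illustration only.
from bs4 import BeautifulSoup

_example_skills_html = '<div><a class="skill-tag">python</a><a class="skill-tag">web scraping</a></div>'


def _example_skills(html):
    container = BeautifulSoup(html, 'html.parser')
    return [a.text for a in container.findAll('a', attrs={'class': 'skill-tag'})]

# _example_skills(_example_skills_html) -> ['python', 'web scraping']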
def _get_info_from_ziprecruiter_result(self, job_link):
    # print >> stderr, job_link
    soup = PostingScraper._get_cleaned_soup_from_url(job_link)
    if not len(soup):
        # print >> stderr, "returning none, soup false, link:%s" % job_link
        return None
    posting = Posting()
    # url and unique id
    try:
        url_meta = soup.find('meta', attrs=ZipRecruiterScraper._job_posting_ogurl_attrs)
        url = url_meta['content']
        posting.add_url(url)
        # the id is the last series of alphanumeric characters after the last hyphen in the
        # url path, e.g. /jobs/proedit-inc-18374379/contract-scientific-writer-2ccbf90f
        # would mean 2ccbf90f
        path = urlsplit(url).path
        last_hyphen = path.rfind('-')
        if last_hyphen != -1:
            # print >> stderr, path[last_hyphen+1:]
            posting.add_id(path[last_hyphen + 1:])
        else:
            # no hyphen found: just take the whole url path after the base
            # print >> stderr, "couldn't find id in url:%s" % url
            # print >> stderr, "making id:", path
            posting.add_id(path)
    except (TypeError, KeyError):
        # TypeError if the og:url meta tag is missing, KeyError if it has no content attribute
        # traceback.print_exc(file=stderr)
        pass
    # source
    posting.add_source(self.source)
    # title
    try:
        title_h1 = soup.find('h1', attrs=ZipRecruiterScraper._job_posting_h1_title_attrs)
        posting.add_title(PostingScraper._encode_unicode(title_h1.text))
    except AttributeError:  # if title_h1 is None
        # traceback.print_exc(file=stderr)
        pass
    # location
    try:
        # try the lat/long meta tag first; it's more exact
        geoloc_meta = soup.find('meta', attrs=ZipRecruiterScraper._job_posting_latlong_attrs)['content']
        geoloc_meta = re.sub(";", ",", geoloc_meta)
        g = geocoder.google(geoloc_meta.split(','), method='reverse')
        # print >> stderr, "reverse by latlong, loc:%s => g:%s" % (geoloc_meta, str(g))
        posting['state'] = g.state
        posting['city'] = g.city
        posting['country'] = g.country
    except (TypeError, AttributeError, KeyError):
        # fall back to the google maps link
        try:
            maps_url = soup.find(href=re.compile(r"google\.com/maps|maps\.google\.com"))['href']
            params = urlsplit(maps_url)[3]
            loc = parse_qs(params)
            loc_str = loc['q'][0]
            g = geocoder.google(loc_str)
            # print >> stderr, "normal by loc:%s => g:%s" % (loc_str, g)
            posting['state'] = g.state
            posting['city'] = g.city
            posting['country'] = g.country
        except (TypeError, AttributeError, KeyError, IndexError):
            # traceback.print_exc(file=stderr)
            pass
    # date posted
    try:
        try:
            # try the explicit 'date posted' paragraph first...
            date_posted_p = soup.find('p', attrs=ZipRecruiterScraper._job_posting_p_date_posted_attrs)
            # find its first 'span'
            date_posted_span = date_posted_p.find('span')
            dt_text = re.sub(r"posted", "", date_posted_span.text.lower()).strip()
            posting.add_date_posted(safe_dt_parse(dt_text))
        except AttributeError:
            try:
                # ...then double-check whether the header has a
                # 'posted today / this week / 12 hours ago / whatever' span
                locdiv = soup.find('div', attrs=ZipRecruiterScraper._job_posting_locdiv_attrs)
                header_div = locdiv.parent
                date_p = header_div.findAll('p')[-1]
                date_span = date_p.find('span')
                date_span_text = date_span.text.lower()
                if "posted" in date_span_text:
                    # and if so, take appropriate action
                    dt_text = re.sub(r"posted", "", date_span_text).strip()
                    posting.add_date_posted(safe_dt_parse(dt_text))
                else:
                    # print >> stderr, "don't have a date found at url:%s" % job_link
                    pass
            except (AttributeError, IndexError):
                # traceback.print_exc(file=stderr)
                pass
    except ValueError:
        print >> stderr, "error parsing date posted string at url:%s" % job_link
    # description
    try:
        description_div = soup.find('div', attrs=ZipRecruiterScraper._job_posting_div_description_attrs)
        posting.add_description(PostingScraper._encode_unicode(description_div.text))
    except AttributeError:
        # traceback.print_exc(file=stderr)
        pass
    return posting
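# --- Illustrative sketch, not part of the scraper classes in this module ---
# Standalone version of the ZipRecruiter id extraction above: take everything after the last
# hyphen in the url path, falling back to the whole path when there is no hyphen. The helper
# name is hypothetical and the host in the sample url is illustrative; the path reuses the
# example from the comment in the method above.
from urlparse import urlsplit


def _example_zip_id(url):
    path = urlsplit(url).path
    last_hyphen = path.rfind('-')
    return path[last_hyphen + 1:] if last_hyphen != -1 else path

# _example_zip_id("https://www.ziprecruiter.com/jobs/proedit-inc-18374379/"
#                 "contract-scientific-writer-2ccbf90f") -> "2ccbf90f"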
def _get_info_from_simplyhired_result(self, result_soup):
    posting = Posting({'source': self.source})
    # external url, title
    try:
        title_link = result_soup.find('a', attrs=SimplyhiredScraper._job_title_link_attrs)
        posting['external_url'] = self._clean_post_url(title_link['href'])
        posting['title'] = PostingScraper._encode_unicode(title_link.text)
    except (AttributeError, KeyError, TypeError):
        # traceback.print_exc(file=stderr)
        pass
    # url
    description_page_url = None
    try:
        tools_container = result_soup.find('div', attrs=SimplyhiredScraper._job_tools_container_attrs)
        tools_links = tools_container.findAll('a')
        description_page_url = tools_links[-1]['href']
        posting['url'] = description_page_url
        posting['unique_id'] = description_page_url  # TODO i couldn't actually find a unique id?
    except (KeyError, AttributeError, IndexError):
        # traceback.print_exc(file=stderr)
        pass
    if description_page_url is not None:
        # follow the posting url for the long description and date posted
        description_page_soup = PostingScraper._get_cleaned_soup_from_url(description_page_url)
        try:
            info_table = description_page_soup.find(
                'table', attrs=SimplyhiredScraper._description_page_info_table_attrs)
            # 4 rows: Company, Location, Date Posted, Source
            row_data_two = []
            trs = info_table.findAll('tr')
            for tr in trs:
                tds = tr.findAll('td')
                try:
                    last_td = tds[-1]
                    row_data_two.append(PostingScraper._encode_unicode(last_td.text))
                except IndexError:
                    # traceback.print_exc(file=stderr)
                    pass
            info_labels = info_table.findAll(
                'td', attrs=SimplyhiredScraper._description_page_table_info_label_attrs)
            info_labels = map(lambda x: PostingScraper._encode_unicode(x.text).lower(), info_labels)
            table_data = zip(info_labels, row_data_two)
            for label, value in table_data:
                if not value.strip():
                    continue
                if 'location' in label:
                    try:
                        # forward-geocode the location text; reverse geocoding expects coordinates
                        g = geocoder.google(value)
                        posting['location'] = (g.city, g.state, g.country)
                    except Exception:
                        # traceback.print_exc(file=stderr)
                        pass
                elif 'date posted' in label:
                    try:
                        posting['date_posted'] = safe_dt_parse(value)
                    except (AttributeError, ValueError):
                        # traceback.print_exc(file=stderr)
                        pass
                elif 'source' in label:
                    posting['external_source'] = value
                elif 'company' in label:
                    posting['company'] = value
        except AttributeError:
            # traceback.print_exc(file=stderr)
            pass
        # description
        try:
            description_div = description_page_soup.find(
                'div', attrs=SimplyhiredScraper._description_page_description_attrs)
            posting['description'] = PostingScraper._encode_unicode(description_div.text)
        except AttributeError:
            # traceback.print_exc(file=stderr)
            pass
    return posting
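# --- Illustrative sketch, not part of the scraper classes in this module ---
# The SimplyHired description-page parsing above pairs the label cells of the info table with
# the last cell of each row. This self-contained example shows the same pairing on invented
# markup; the "info-label" class, the helper name, and the sample rows are assumptions for
# illustration only.
from bs4 import BeautifulSoup

_example_info_table = """
<table>
  <tr><td class="info-label">Company</td><td>Acme Corp</td></tr>
  <tr><td class="info-label">Location</td><td>Boston, MA</td></tr>
  <tr><td class="info-label">Date Posted</td><td>May 5, 2016</td></tr>
</table>
"""


def _example_info_pairs(html):
    table = BeautifulSoup(html, 'html.parser')
    values = [tr.findAll('td')[-1].text for tr in table.findAll('tr')]
    labels = [td.text.lower() for td in table.findAll('td', attrs={'class': 'info-label'})]
    return list(zip(labels, values))

# _example_info_pairs(_example_info_table) -> [('company', 'Acme Corp'),
#                                              ('location', 'Boston, MA'),
#                                              ('date posted', 'May 5, 2016')]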