def _get_info_from_guru_job_page_soup(self, posting_soup):
    posting = Posting({'source': self.source})
    # url, and unique id
    url = posting_soup.find('meta', attrs=GuruScraper._job_url_meta_attrs)
    if url is not None:
        posting['url'] = self._clean_post_url(url['content'])
        url_parts = urlsplit(posting['url'])
        if url_parts.path:
            # drop empty path segments; the last remaining one is the unique id
            sections = [s for s in url_parts.path.split('/') if s]
            if sections:
                posting['unique_id'] = sections[-1]
    # title
    try:
        title_header = posting_soup.find(attrs=GuruScraper._job_title_attrs)
        posting['title'] = PostingScraper._encode_unicode(title_header.text)
    except AttributeError:
        pass
    # date posted
    try:
        date_posted_span = posting_soup.find('span', attrs=GuruScraper._job_date_posted_span_attrs)
        posting['date_posted'] = safe_dt_parse(date_posted_span['data-date'])
    except (KeyError, AttributeError, ValueError):
        # traceback.print_exc(file=stderr)
        pass
    # duration
    try:
        duration_span = posting_soup.find('span', attrs=GuruScraper._job_duration_span_attrs)
        actual_date_span = duration_span.find('span')
        posting['duration'] = safe_dt_parse(actual_date_span['data-date'])
    except (KeyError, AttributeError, ValueError, TypeError):
        # traceback.print_exc(file=stderr)
        pass
    # budget
    try:
        budget_div = posting_soup.find('div', attrs=GuruScraper._job_budget_div_attrs)
        posting['budget'] = PostingScraper._encode_unicode(budget_div.text)
    except AttributeError:
        # traceback.print_exc(file=stderr)
        pass
    # skills
    try:
        skills_section = posting_soup.find(attrs=GuruScraper._job_skill_section_attrs)
        # findAll (not find) so that we map over every skill link in the section
        posting['skills'] = map(
            lambda x: PostingScraper._encode_unicode(PostingScraper._get_cleaned_soup_from_html(str(x)).text),
            skills_section.findAll('a', attrs=GuruScraper._job_skill_link_attrs))
    except AttributeError:
        # traceback.print_exc(file=stderr)
        pass
    # experience, description
    try:
        description_section = posting_soup.find(attrs=GuruScraper._job_experience_reqs_section_attrs)
        posting['description'] = PostingScraper._encode_unicode(
            PostingScraper._get_cleaned_soup_from_html(str(description_section)).text)
    except AttributeError:
        # traceback.print_exc(file=stderr)
        pass
    return posting
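# --- Illustrative sketch, not part of the scraper classes in this module ---
# The "last non-empty path segment" logic used by the Guru and Upwork scrapers could live in
# a small standalone helper like this one. The helper name and the sample URL are hypothetical;
# urlsplit comes from urlparse on Python 2 (urllib.parse on Python 3).
from urlparse import urlsplit


def _example_unique_id_from_url(url):
    """Return the last non-empty path segment of url, or None if the path has none."""
    sections = [s for s in urlsplit(url).path.split('/') if s]
    return sections[-1] if sections else None

# _example_unique_id_from_url("https://www.example.com/jobs/some-posting/123456/") -> "123456"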
def _get_info_from_indeed_result(self, row_result_soup):
    posting = Posting({"source": self.source})
    # url, title
    try:
        url_data = row_result_soup.find("a")
        posting["url"] = self._clean_post_url(url_data["href"])
        posting["title"] = PostingScraper._encode_unicode(url_data.text)
    except (AttributeError, KeyError, TypeError):
        pass
    # id
    try:
        posting["unique_id"] = row_result_soup["data-jk"]
    except KeyError:
        pass
    # location
    try:
        loc = row_result_soup.find("span", IndeedScraper._job_location_span_attrs).text
        if loc:
            # forward-geocode the location text; reverse geocoding expects coordinates
            g = geocoder.google(loc)
            posting["location"] = (g.city, g.state, g.country)
    except Exception:
        # traceback.print_exc(file=stderr)
        pass
    # date posted
    try:
        date_posted_span = row_result_soup.find("span", attrs=IndeedScraper._job_date_span_attrs)
        date_posted_text = PostingScraper._encode_unicode(date_posted_span.text).lower()
        if date_posted_text == "just posted":
            date_posted_text = "now"
        try:
            posting["date_posted"] = safe_dt_parse(date_posted_text)  # also throws AttributeError
        except (AttributeError, ValueError):
            # traceback.print_exc(file=stderr)
            pass
    except AttributeError:
        # traceback.print_exc(file=stderr)
        pass
    # description
    try:
        posting["description"] = PostingScraper._encode_unicode(
            row_result_soup.find("span", attrs=IndeedScraper._job_description_span_attrs).text
        )
    except AttributeError:
        # traceback.print_exc(file=stderr)
        pass
    if "description" not in posting:
        # fall back to the summary span
        try:
            posting["description"] = PostingScraper._encode_unicode(
                row_result_soup.find("span", attrs=IndeedScraper._job_summary_span_attrs).text
            )
        except AttributeError:
            # traceback.print_exc(file=stderr)
            pass
    return posting
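# --- Illustrative sketch, not part of the scraper classes in this module ---
# The location handling above forward-geocodes free text like "Boston, MA" and keeps the
# (city, state, country) triple. This standalone sketch shows the same call pattern with the
# third-party `geocoder` package; it needs network access, and the exact values returned
# depend on the geocoding service. The helper name and sample input are hypothetical.
import geocoder


def _example_location_triple(location_text):
    g = geocoder.google(location_text)
    return (g.city, g.state, g.country)

# _example_location_triple("Boston, MA") would be expected to yield something like
# ("Boston", "Massachusetts", "US"), depending on the service's response.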
def _get_info_from_clp_posting(self, url):
    posting = Posting({"source": self.source})
    soup = PostingScraper._get_cleaned_soup_from_url(url)
    posting["url"] = url
    # title
    try:
        posting["title"] = PostingScraper._encode_unicode(
            soup.find(attrs=CraigslistScraper._post_title_attrs).text)
    except AttributeError:
        # traceback.print_exc(file=stderr)
        pass
    # location: reverse-geocode the lat/lng embedded in the google maps link
    try:
        loc = soup.find(href=re.compile("google.com/maps"))["href"]
        g = geocoder.google(loc[loc.find("@") + 1:-1].split(",")[:-1], method="reverse")
        posting["state"] = g.state
        posting["city"] = g.city
        posting["country"] = g.country
    except Exception:
        pass
    # compensation
    try:
        posting["price"] = PostingScraper._encode_unicode(
            soup.find(text=re.compile(".*compensation.*")).parent.findChild("b").text
        ).replace("$", "")
    except AttributeError:
        pass
    # description
    try:
        posting["description"] = PostingScraper._encode_unicode(
            soup.find("section", {"id": "postingbody"}).text)
    except AttributeError:
        pass
    # date posted
    try:
        posting["date_posted"] = soup.find("time")["datetime"]
    except (AttributeError, KeyError, TypeError):
        pass
    # unique id: the numeric id at the end of the posting url
    try:
        posting["unique_id"] = "clgig" + re.match(r"[^\d]*(\d+)\.html", url).group(1)
    except AttributeError:
        pass
    return posting
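# --- Illustrative sketch, not part of the scraper classes in this module ---
# Standalone version of the unique-id extraction used for Craigslist postings above: take the
# first run of digits before ".html" and prefix it with "clgig". The helper name and the sample
# URL are hypothetical.
import re


def _example_clp_unique_id(url):
    m = re.match(r"[^\d]*(\d+)\.html", url)
    return ("clgig" + m.group(1)) if m else None

# _example_clp_unique_id("https://boston.craigslist.org/cpg/1234567890.html") -> "clgig1234567890"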
def _get_info_from_upwork_posting(self, posting_soup):
    """
    Given an Upwork article HTML object, extract the desired information and return it as a dict.
    :param posting_soup: the Soup-ed HTML
    :return: the data in a dict
    """
    posting = Posting({'source': self.source})
    # url, title, and unique id
    try:
        url = posting_soup.find('meta', attrs=UpworkScraper._job_url_attrs)
        if url is not None:
            posting['url'] = self._clean_post_url(url['content'])
            posting['title'] = PostingScraper._encode_unicode(url.text)
            url_parts = urlsplit(posting['url'])
            if url_parts.path:
                # drop empty path segments; the last remaining one is the unique id
                sections = [s for s in url_parts.path.split('/') if s]
                if sections:
                    posting['unique_id'] = sections[-1]
    except (KeyError, AttributeError):
        # traceback.print_exc(file=stderr)
        pass
    container = posting_soup.find(attrs=UpworkScraper._job_container_attrs)
    # date posted: it's in the 'popover' attribute
    try:
        date_posted_span = container.find(attrs=UpworkScraper._job_dateposted_attrs)
        posting['date_posted'] = safe_dt_parse(date_posted_span['popover'])
    except (KeyError, AttributeError, ValueError, TypeError):
        # traceback.print_exc(file=stderr)
        pass
    # price: second row of container, first element, first row inside that
    try:
        second_row = container.findAll('div', attrs=UpworkScraper._div_row_attrs)[1]
        first_child = second_row.find('div')
        price_row = first_child.find('div', attrs=UpworkScraper._div_row_attrs)
        posting['price_info'] = PostingScraper._encode_unicode(price_row.text)
    except (IndexError, AttributeError):
        # IndexError if the container has no second 'row' div,
        # AttributeError if any of the nested divs (or the price row) is missing
        # traceback.print_exc(file=stderr)
        pass
    # description
    try:
        description_air_card = container.find('div', attrs=UpworkScraper._job_descrip_aircard_attrs)
        posting['description'] = PostingScraper._encode_unicode(description_air_card.find('p').text)
    except AttributeError:  # handle if soup finds nothing
        # traceback.print_exc(file=stderr)
        pass
    # skills
    try:
        posting['skills'] = map(lambda x: PostingScraper._encode_unicode(x.text),
                                container.findAll('a', attrs=UpworkScraper._job_skill_tag_attrs))
    except AttributeError:  # handle if soup finds nothing for skills
        pass
    return posting
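# --- Illustrative sketch, not part of the scraper classes in this module ---
# The skills extraction above maps over every matching <a> tag returned by findAll. This
# self-contained example shows the same pattern on invented markup; the "skill-tag" class and
# the helper name are assumptions for illustration only.
from bs4 import BeautifulSoup

_example_skills_html = '<div><a class="skill-tag">python</a><a class="skill-tag">web scraping</a></div>'


def _example_skills(html):
    container = BeautifulSoup(html, 'html.parser')
    return [a.text for a in container.findAll('a', attrs={'class': 'skill-tag'})]

# _example_skills(_example_skills_html) -> ['python', 'web scraping']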
def _get_info_from_ziprecruiter_result(self, job_link):
    # print >> stderr, job_link
    soup = PostingScraper._get_cleaned_soup_from_url(job_link)
    if not len(soup):
        # print >> stderr, "returning none, soup false, link:%s" % job_link
        return None
    posting = Posting()
    # url and unique id
    try:
        url_meta = soup.find('meta', attrs=ZipRecruiterScraper._job_posting_ogurl_attrs)
        url = url_meta['content']
        posting.add_url(url)
        # the id is the last series of alphanumeric characters after the last hyphen in the
        # url path, e.g. /jobs/proedit-inc-18374379/contract-scientific-writer-2ccbf90f
        # would mean 2ccbf90f
        path = urlsplit(url).path
        last_hyphen = path.rfind('-')
        if last_hyphen != -1:
            # print >> stderr, path[last_hyphen+1:]
            posting.add_id(path[last_hyphen + 1:])
        else:
            # no hyphen found: just take the whole url path after the base
            # print >> stderr, "couldn't find id in url:%s" % url
            # print >> stderr, "making id:", path
            posting.add_id(path)
    except (TypeError, KeyError):
        # TypeError if the og:url meta tag is missing, KeyError if it has no content attribute
        # traceback.print_exc(file=stderr)
        pass
    # source
    posting.add_source(self.source)
    # title
    try:
        title_h1 = soup.find('h1', attrs=ZipRecruiterScraper._job_posting_h1_title_attrs)
        posting.add_title(PostingScraper._encode_unicode(title_h1.text))
    except AttributeError:  # if title_h1 is None
        # traceback.print_exc(file=stderr)
        pass
    # location
    try:
        # try the lat/long meta tag first; it's more exact
        geoloc_meta = soup.find('meta', attrs=ZipRecruiterScraper._job_posting_latlong_attrs)['content']
        geoloc_meta = re.sub(";", ",", geoloc_meta)
        g = geocoder.google(geoloc_meta.split(','), method='reverse')
        # print >> stderr, "reverse by latlong, loc:%s => g:%s" % (geoloc_meta, str(g))
        posting['state'] = g.state
        posting['city'] = g.city
        posting['country'] = g.country
    except (TypeError, AttributeError, KeyError):
        # fall back to the google maps link
        try:
            maps_url = soup.find(href=re.compile(r"google\.com/maps|maps\.google\.com"))['href']
            params = urlsplit(maps_url)[3]
            loc = parse_qs(params)
            loc_str = loc['q'][0]
            g = geocoder.google(loc_str)
            # print >> stderr, "normal by loc:%s => g:%s" % (loc_str, g)
            posting['state'] = g.state
            posting['city'] = g.city
            posting['country'] = g.country
        except (TypeError, AttributeError, KeyError, IndexError):
            # traceback.print_exc(file=stderr)
            pass
    # date posted
    try:
        try:
            # try the explicit 'date posted' paragraph first...
            date_posted_p = soup.find('p', attrs=ZipRecruiterScraper._job_posting_p_date_posted_attrs)
            # find its first 'span'
            date_posted_span = date_posted_p.find('span')
            dt_text = re.sub(r"posted", "", date_posted_span.text.lower()).strip()
            posting.add_date_posted(safe_dt_parse(dt_text))
        except AttributeError:
            try:
                # ...then double-check whether the header has a
                # 'posted today / this week / 12 hours ago / whatever' span
                locdiv = soup.find('div', attrs=ZipRecruiterScraper._job_posting_locdiv_attrs)
                header_div = locdiv.parent
                date_p = header_div.findAll('p')[-1]
                date_span = date_p.find('span')
                date_span_text = date_span.text.lower()
                if "posted" in date_span_text:
                    # and if so, take appropriate action
                    dt_text = re.sub(r"posted", "", date_span_text).strip()
                    posting.add_date_posted(safe_dt_parse(dt_text))
                else:
                    # print >> stderr, "don't have a date found at url:%s" % job_link
                    pass
            except (AttributeError, IndexError):
                # traceback.print_exc(file=stderr)
                pass
    except ValueError:
        print >> stderr, "error parsing date posted string at url:%s" % job_link
    # description
    try:
        description_div = soup.find('div', attrs=ZipRecruiterScraper._job_posting_div_description_attrs)
        posting.add_description(PostingScraper._encode_unicode(description_div.text))
    except AttributeError:
        # traceback.print_exc(file=stderr)
        pass
    return posting
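# --- Illustrative sketch, not part of the scraper classes in this module ---
# Standalone version of the ZipRecruiter id extraction above: take everything after the last
# hyphen in the url path, falling back to the whole path when there is no hyphen. The helper
# name is hypothetical and the host in the sample url is illustrative; the path reuses the
# example from the comment in the method above.
from urlparse import urlsplit


def _example_zip_id(url):
    path = urlsplit(url).path
    last_hyphen = path.rfind('-')
    return path[last_hyphen + 1:] if last_hyphen != -1 else path

# _example_zip_id("https://www.ziprecruiter.com/jobs/proedit-inc-18374379/"
#                 "contract-scientific-writer-2ccbf90f") -> "2ccbf90f"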
def _get_info_from_simplyhired_result(self, result_soup):
    posting = Posting({'source': self.source})
    # external url, title
    try:
        title_link = result_soup.find('a', attrs=SimplyhiredScraper._job_title_link_attrs)
        posting['external_url'] = self._clean_post_url(title_link['href'])
        posting['title'] = PostingScraper._encode_unicode(title_link.text)
    except (AttributeError, KeyError, TypeError):
        # traceback.print_exc(file=stderr)
        pass
    # url
    description_page_url = None
    try:
        tools_container = result_soup.find('div', attrs=SimplyhiredScraper._job_tools_container_attrs)
        tools_links = tools_container.findAll('a')
        description_page_url = tools_links[-1]['href']
        posting['url'] = description_page_url
        posting['unique_id'] = description_page_url  # TODO i couldn't actually find a unique id?
    except (KeyError, AttributeError, IndexError):
        # traceback.print_exc(file=stderr)
        pass
    if description_page_url is not None:
        # follow the posting url for the long description and date posted
        description_page_soup = PostingScraper._get_cleaned_soup_from_url(description_page_url)
        try:
            info_table = description_page_soup.find(
                'table', attrs=SimplyhiredScraper._description_page_info_table_attrs)
            # 4 rows: Company, Location, Date Posted, Source
            row_data_two = []
            trs = info_table.findAll('tr')
            for tr in trs:
                tds = tr.findAll('td')
                try:
                    last_td = tds[-1]
                    row_data_two.append(PostingScraper._encode_unicode(last_td.text))
                except IndexError:
                    # traceback.print_exc(file=stderr)
                    pass
            info_labels = info_table.findAll(
                'td', attrs=SimplyhiredScraper._description_page_table_info_label_attrs)
            info_labels = map(lambda x: PostingScraper._encode_unicode(x.text).lower(), info_labels)
            table_data = zip(info_labels, row_data_two)
            for label, value in table_data:
                if not value.strip():
                    continue
                if 'location' in label:
                    try:
                        # forward-geocode the location text; reverse geocoding expects coordinates
                        g = geocoder.google(value)
                        posting['location'] = (g.city, g.state, g.country)
                    except Exception:
                        # traceback.print_exc(file=stderr)
                        pass
                elif 'date posted' in label:
                    try:
                        posting['date_posted'] = safe_dt_parse(value)
                    except (AttributeError, ValueError):
                        # traceback.print_exc(file=stderr)
                        pass
                elif 'source' in label:
                    posting['external_source'] = value
                elif 'company' in label:
                    posting['company'] = value
        except AttributeError:
            # traceback.print_exc(file=stderr)
            pass
        # description
        try:
            description_div = description_page_soup.find(
                'div', attrs=SimplyhiredScraper._description_page_description_attrs)
            posting['description'] = PostingScraper._encode_unicode(description_div.text)
        except AttributeError:
            # traceback.print_exc(file=stderr)
            pass
    return posting
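# --- Illustrative sketch, not part of the scraper classes in this module ---
# The SimplyHired description-page parsing above pairs the label cells of the info table with
# the last cell of each row. This self-contained example shows the same pairing on invented
# markup; the "info-label" class, the helper name, and the sample rows are assumptions for
# illustration only.
from bs4 import BeautifulSoup

_example_info_table = """
<table>
  <tr><td class="info-label">Company</td><td>Acme Corp</td></tr>
  <tr><td class="info-label">Location</td><td>Boston, MA</td></tr>
  <tr><td class="info-label">Date Posted</td><td>May 5, 2016</td></tr>
</table>
"""


def _example_info_pairs(html):
    table = BeautifulSoup(html, 'html.parser')
    values = [tr.findAll('td')[-1].text for tr in table.findAll('tr')]
    labels = [td.text.lower() for td in table.findAll('td', attrs={'class': 'info-label'})]
    return list(zip(labels, values))

# _example_info_pairs(_example_info_table) -> [('company', 'Acme Corp'),
#                                              ('location', 'Boston, MA'),
#                                              ('date posted', 'May 5, 2016')]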