Example #1
 def get_postings(self, query, pages=1):
     try:
         query = quote_plus(query)
         postings = []
         for page_num in range(1, pages + 1):
             search_url = urlunsplit(
                 (
                     self.scheme,
                     self.source,
                     "jobs",
                     "q=%s&l=%s&sort=date&start=%d" % (query, self.location, (page_num - 1) * 10),
                     "",
                 )
             )
             print >> stderr, search_url
             soup = PostingScraper._get_cleaned_soup_from_url(search_url)
             job_results_td = soup.find("td", attrs=IndeedScraper._job_results_col_attrs)
             try:
                 postings.extend(job_results_td.findAll("div", IndeedScraper._job_row_result_div_attrs))
             except AttributeError:
                 # traceback.print_exc(file=stderr)
                 pass
         return list(set(PostingScraper._remove_none_from_things(map(self._get_info_from_indeed_result, postings))))
     except Exception:
         traceback.print_exc(file=stderr)
         return []
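A minimal usage sketch for the Indeed variant, assuming the scraper's constructor takes a location (the class definition and constructor are not shown in this excerpt, so the arguments below are placeholders):

 # hypothetical usage; IndeedScraper's real constructor signature is not shown above
 scraper = IndeedScraper(location="baltimore, md")   # assumed keyword argument
 for posting in scraper.get_postings("massage therapist", pages=2):
     print posting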
Example #2
 def get_postings(self, query, pages=1):
     postings = []
     try:
         query = re.sub(' ', '-', query)  # Guru expects hyphens instead of spaces in the query path
         query = quote_plus(query)
         for page_num in range(1, pages + 1):
             search_url = urlunsplit((self.scheme, self.source, "d/jobs/q/%s/pg/%d" % (query, page_num), "", ""))
             print >> stderr, search_url
             soup = PostingScraper._get_cleaned_soup_from_url(search_url)
             services_list = soup.find(attrs=GuruScraper._job_search_results_list_attr)
             try:  # handle if there are more pages than results... services_list won't exist
                 for i, li in enumerate(services_list.findAll('li', attrs=GuruScraper._job_search_result_list_item_attrs)):
                     h2 = li.find('h2', attrs=GuruScraper._job_search_results_header_attrs)
                     a = h2.find('a')
                     postings.append(self._clean_post_url(a['href']))
             except (AttributeError, TypeError, KeyError):
                 # also handle misc errors, want to gracefully return postings we already have
                 # traceback.print_exc(file=stderr)
                 pass
         return list(set(PostingScraper._remove_none_from_things(
             map(self._get_info_from_guru_job_page_soup,
                 map(PostingScraper._get_cleaned_soup_from_url, postings)))))
     except Exception:
         traceback.print_exc(file=stderr)
         return []
Example #3
 def get_postings(self, query, pages=1):
     try:
         postings = []
         query = quote_plus(query)
         for page_num in range(1, pages+1):
             # https://www.simplyhired.com/search?q=massage+therapist&l=baltimore%2C+md&pn=2
             search_url = urlunsplit((self.scheme, self.source, "search",
                                      "q=%s&l=%s&pn=%d" % (query, quote_plus(self.location.lower()), page_num), ""))
             print >> stderr, search_url
             soup = PostingScraper._get_cleaned_soup_from_url(search_url)
             job_results_list_div = soup.find('div', attrs=SimplyhiredScraper._job_results_list_div_attrs)
             try:
                 postings.extend(job_results_list_div.findAll('div', SimplyhiredScraper._job_result_div_attrs))
             except AttributeError:
                 # traceback.print_exc(file=stderr)
                 pass
         return list(set(PostingScraper._remove_none_from_things(
             map(self._get_info_from_simplyhired_result, postings))))
     except Exception:
         traceback.print_exc(file=stderr)
         return []
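All of these scrapers share the same parsing idiom: find a results container by its attrs dict, call findAll on it, and let an AttributeError stand in for "no results on this page". Below is a standalone sketch of that idiom, assuming BeautifulSoup 4; the literal attrs dicts stand in for class constants like SimplyhiredScraper._job_results_list_div_attrs, whose real values are not shown in this excerpt.

 # minimal sketch of the find / findAll / AttributeError pattern, assuming bs4
 from bs4 import BeautifulSoup

 html = '<div class="results"><div class="job">a</div><div class="job">b</div></div>'
 soup = BeautifulSoup(html, "html.parser")
 container = soup.find("div", attrs={"class": "results"})  # stands in for the attrs constants
 try:
     rows = container.findAll("div", {"class": "job"})
 except AttributeError:  # container is None when the page has no results
     rows = []
 print len(rows)  # -> 2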
Example #4
 def get_postings(self, query, pages=1):
     try:
         query = quote_plus(query)  # don't use urlencode, some sites depend on argument order
         posts = []  # temporary variable to store all of the posting data
         for i in range(1, pages + 1):
             search_url = urlunsplit(
                 (self.scheme, self.source, "/search/ggg", "query=%s&sort=date&s=%d" % (query, (i - 1) * 100), "")  # s is a zero-based result offset
             )
             print >> stderr, search_url
             soup = PostingScraper._get_cleaned_soup_from_url(search_url)
             try:
                 posts += [
                     self._clean_post_url(a["href"]) for a in soup.findAll("a", {"data-id": re.compile(r"\d+")})
                 ]
             except KeyError:  # handle if no href
                 pass
         return list(
             set(PostingScraper._remove_none_from_things([self._get_info_from_clp_posting(post) for post in posts]))
         )
     except Exception:
         traceback.print_exc(file=stderr)
         return []
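Each example builds its search URL with urlunsplit and a hand-formatted query string rather than urlencode because, as the comment above notes, some sites are sensitive to argument order. A standalone illustration of that pattern follows; the scheme and host are hard-coded here, whereas the examples take them from self.scheme and self.source, and under Python 3 both functions live in urllib.parse.

 # URL-building pattern shared by these examples (Python 2 imports, matching the code above)
 from urlparse import urlunsplit
 from urllib import quote_plus

 query = quote_plus("massage therapist")  # -> "massage+therapist"
 search_url = urlunsplit(("https", "www.indeed.com", "jobs",
                          "q=%s&l=%s&sort=date&start=%d" % (query, "baltimore", 0), ""))
 # -> https://www.indeed.com/jobs?q=massage+therapist&l=baltimore&sort=date&start=0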
Example #5
 def get_postings(self, query, pages=1):
     try:
         query = quote_plus(query)  # don't use urlencode, some sites depend on argument order
         posts = []
         # example url:
         # https://www.upwork.com/o/jobs/browse/?q=therapist
         for i in range(1, pages + 1):
             search_url = urlunsplit((self.scheme, self.source, "/o/jobs/browse/", "page=%d&q=%s" % (i, query), ""))
             print >> stderr, search_url
             soup = PostingScraper._get_cleaned_soup_from_url(search_url)
             # this url returns a list of job postings; visit each posting's page
             for article in soup.findAll('article'):  # get all 'article'
                 url = article.find('a', attrs=UpworkScraper._job_search_result_link_attrs)
                 try:
                     posts.append(self._clean_post_url(url['href']))
                 except (TypeError, KeyError):
                     pass
         return list(set(PostingScraper._remove_none_from_things(
             map(self._get_info_from_upwork_posting,
                 map(PostingScraper._get_cleaned_soup_from_url, posts)))))
     except Exception:
         traceback.print_exc(file=stderr)
         return []
Example #6
 def get_postings(self, query, pages=1):
     try:
         postings = []
         query = quote_plus(query)
         for page_num in range(1, pages+1):
             # https://www.ziprecruiter.com/candidate/search?sort=best-match&search=writer&page=2&location=baltimore
             search_url = urlunsplit((self.scheme, self.source, "candidate/search",
                                      "sort=best-match&search=%s&location=%s&page=%d" %
                                      (query, quote_plus(self.location.lower()), page_num), ""))
             print >> stderr, search_url
             soup = PostingScraper._get_cleaned_soup_from_url(search_url)
             job_results_list_div = soup.find('div', attrs=ZipRecruiterScraper._job_list_div_attrs)
             try:
                 postings.extend(map(lambda x: x['href'],
                                     job_results_list_div.findAll('a', ZipRecruiterScraper._job_result_link_attrs)))
             except AttributeError:
                 # traceback.print_exc(file=stderr)
                 pass
         # note: _get_info_from_ziprecruiter_result may return None for results that link to external sites
         postings = map(self._get_info_from_ziprecruiter_result, postings)
         return list(set(PostingScraper._remove_none_from_things(postings)))
     except Exception:
         traceback.print_exc(file=stderr)
         return []
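Every variant finishes the same way: map a per-posting parser over the collected results, strip out the None entries, and deduplicate through set(). A minimal standalone sketch of that final step follows, with a plain function standing in for helpers like _get_info_from_indeed_result and a generator expression standing in for _remove_none_from_things (their real definitions are not shown in this excerpt).

 # sketch of the map -> drop None -> dedupe pattern that closes each example;
 # parse_posting stands in for helpers like _get_info_from_indeed_result
 def parse_posting(url):
     if "external" in url:  # pretend external links cannot be parsed
         return None
     return url.rstrip("/")

 raw = ["http://example.com/job/1/", "http://example.com/job/1",
        "http://external.example.com/job/2"]
 parsed = map(parse_posting, raw)  # returns a list in Python 2
 unique = list(set(p for p in parsed if p is not None))
 # -> ["http://example.com/job/1"]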