def get_postings(self, query, pages=1):
    try:
        query = quote_plus(query)
        postings = []
        for page_num in range(1, pages + 1):
            # Indeed paginates with a start offset of 10 results per page
            search_url = urlunsplit(
                (
                    self.scheme,
                    self.source,
                    "jobs",
                    "q=%s&l=%s&sort=date&start=%d" % (query, self.location, (page_num - 1) * 10),
                    "",
                )
            )
            print >> stderr, search_url
            soup = PostingScraper._get_cleaned_soup_from_url(search_url)
            job_results_td = soup.find("td", attrs=IndeedScraper._job_results_col_attrs)
            try:
                postings.extend(job_results_td.findAll("div", IndeedScraper._job_row_result_div_attrs))
            except AttributeError:  # job_results_td is None when a page has no results
                # traceback.print_exc(file=stderr)
                pass
        return list(set(PostingScraper._remove_none_from_things(
            map(self._get_info_from_indeed_result, postings))))
    except Exception:
        traceback.print_exc(file=stderr)
        return []
def get_postings(self, query, pages=1):
    postings = []
    try:
        query = re.sub(' ', '-', query)  # guru expects hyphens, not spaces, in the query path
        query = quote_plus(query)
        for page_num in range(1, pages + 1):
            search_url = urlunsplit((self.scheme, self.source,
                                     "d/jobs/q/%s/pg/%d" % (query, page_num), "", ""))
            print >> stderr, search_url
            soup = PostingScraper._get_cleaned_soup_from_url(search_url)
            services_list = soup.find(attrs=GuruScraper._job_search_results_list_attr)
            try:
                # handle if there are more pages than results... services_list won't exist
                for li in services_list.findAll('li', attrs=GuruScraper._job_search_result_list_item_attrs):
                    h2 = li.find('h2', attrs=GuruScraper._job_search_results_header_attrs)
                    a = h2.find('a')
                    postings.append(self._clean_post_url(a['href']))
            except (AttributeError, TypeError, KeyError):
                # also handles misc errors; gracefully return the postings we already have
                # traceback.print_exc(file=stderr)
                pass
        return list(set(PostingScraper._remove_none_from_things(
            map(self._get_info_from_guru_job_page_soup,
                map(PostingScraper._get_cleaned_soup_from_url, postings)))))
    except Exception:
        traceback.print_exc(file=stderr)
        return []
def get_postings(self, query, pages=1):
    try:
        postings = []
        query = quote_plus(query)
        for page_num in range(1, pages + 1):
            # e.g. https://www.simplyhired.com/search?q=massage+therapist&l=baltimore%2C+md&pn=2
            search_url = urlunsplit((self.scheme, self.source, "search",
                                     "q=%s&l=%s&pn=%d" % (query, quote_plus(self.location.lower()), page_num),
                                     ""))
            print >> stderr, search_url
            soup = PostingScraper._get_cleaned_soup_from_url(search_url)
            job_results_list_div = soup.find('div', attrs=SimplyhiredScraper._job_results_list_div_attrs)
            try:
                postings.extend(job_results_list_div.findAll('div', SimplyhiredScraper._job_result_div_attrs))
            except AttributeError:  # no results container on this page
                # traceback.print_exc(file=stderr)
                pass
        return list(set(PostingScraper._remove_none_from_things(
            map(self._get_info_from_simplyhired_result, postings))))
    except Exception:
        traceback.print_exc(file=stderr)
        return []
def get_postings(self, query, pages=1):
    try:
        query = quote_plus(query)  # don't use urlencode, some sites depend on argument order
        posts = []  # temporary variable to store all of the posting data
        for i in range(1, pages + 1):
            # craigslist paginates with an s= offset of 100 results per page
            search_url = urlunsplit((self.scheme, self.source, "/search/ggg",
                                     "query=%s&sort=date&s=%d" % (query, (i - 1) * 100), ""))
            print >> stderr, search_url
            soup = PostingScraper._get_cleaned_soup_from_url(search_url)
            try:
                posts += [self._clean_post_url(a["href"])
                          for a in soup.findAll("a", {"data-id": re.compile(r"\d+")})]
            except KeyError:  # handle if no href
                pass
        return list(set(PostingScraper._remove_none_from_things(
            [self._get_info_from_clp_posting(post) for post in posts])))
    except Exception:
        traceback.print_exc(file=stderr)
        return []
def get_postings(self, query, pages=1):
    try:
        query = quote_plus(query)  # don't use urlencode, some sites depend on argument order
        posts = []
        # example url:
        # https://www.upwork.com/o/jobs/browse/?q=therapist
        for i in range(1, pages + 1):
            search_url = urlunsplit((self.scheme, self.source, "/o/jobs/browse/",
                                     "page=%d&q=%s" % (i, query), ""))
            print >> stderr, search_url
            soup = PostingScraper._get_cleaned_soup_from_url(search_url)
            # this url returns a list of posting articles; collect the link from each one
            for article in soup.findAll('article'):
                url = article.find('a', attrs=UpworkScraper._job_search_result_link_attrs)
                try:
                    posts.append(self._clean_post_url(url['href']))
                except (TypeError, KeyError):  # no matching link, or link has no href
                    pass
        return list(set(PostingScraper._remove_none_from_things(
            map(self._get_info_from_upwork_posting,
                map(PostingScraper._get_cleaned_soup_from_url, posts)))))
    except Exception:
        traceback.print_exc(file=stderr)
        return []
def get_postings(self, query, pages=1):
    try:
        postings = []
        query = quote_plus(query)
        for page_num in range(1, pages + 1):
            # e.g. https://www.ziprecruiter.com/candidate/search?sort=best-match&search=writer&page=2&location=baltimore
            search_url = urlunsplit((self.scheme, self.source, "candidate/search",
                                     "sort=best-match&search=%s&location=%s&page=%d"
                                     % (query, quote_plus(self.location.lower()), page_num),
                                     ""))
            print >> stderr, search_url
            soup = PostingScraper._get_cleaned_soup_from_url(search_url)
            job_results_list_div = soup.find('div', attrs=ZipRecruiterScraper._job_list_div_attrs)
            try:
                postings.extend(map(lambda x: x['href'],
                                    job_results_list_div.findAll('a', ZipRecruiterScraper._job_result_link_attrs)))
            except AttributeError:  # no results container on this page
                # traceback.print_exc(file=stderr)
                pass
        # _get_info_from_ziprecruiter_result may return None for links to external urls;
        # _remove_none_from_things filters those out
        postings = map(self._get_info_from_ziprecruiter_result, postings)
        return list(set(PostingScraper._remove_none_from_things(postings)))
    except Exception:
        traceback.print_exc(file=stderr)
        return []
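

# A minimal usage sketch, assuming each scraper's constructor takes the search
# location (the signatures below are illustrative; this section only shows the
# classes reading self.scheme, self.source, and self.location). Each call
# returns a de-duplicated list of posting records, or [] on failure.
if __name__ == "__main__":
    scrapers = [IndeedScraper(location="baltimore"),
                ZipRecruiterScraper(location="baltimore")]
    for scraper in scrapers:
        for posting in scraper.get_postings("massage therapist", pages=2):
            print posting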