# indeed_scraper.py
import re
import traceback
from sys import stderr
from urllib import quote_plus
from urlparse import urlunsplit

import geocoder

from posting_class import Posting
from posting_scraper import PostingScraper
from super_dt_parser import safe_dt_parse

__author__ = 'mcs'


class IndeedScraper(PostingScraper):
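    """Scrapes job postings from Indeed search-result pages.

    The attrs dicts below locate pieces of Indeed's result markup: the
    results column <td>, each result-row <div>, and the location, date,
    description, and summary <span>s inside a row.
    """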
_job_results_col_attrs = {"id": "resultsCol"}
_job_row_result_div_attrs = {"class": re.compile(r"row.*result")}
_job_location_span_attrs = {"class": "location"}
_job_date_span_attrs = {"class": "date"}
_job_description_span_attrs = {"itemprop": "description"}
_job_summary_span_attrs = {"class": "summary"}

    def __init__(self, location=""):
        PostingScraper.__init__(self, "http://www.indeed.com")
        self.location = location or ""

    def _get_info_from_indeed_result(self, row_result_soup):
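        """Extract a Posting from a single Indeed search-result row."""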
posting = Posting({'source': self.source})
# url, title
try:
url_data = row_result_soup.find('a')
posting['url'] = self._clean_post_url(url_data['href'])
posting['title'] = PostingScraper._encode_unicode(url_data.text)
except (AttributeError, KeyError, TypeError):
pass
        # unique id: Indeed stores its job key in the row's data-jk attribute
try:
posting['unique_id'] = row_result_soup['data-jk']
except KeyError:
pass
# location
try:
            loc = row_result_soup.find('span', IndeedScraper._job_location_span_attrs).text
            if loc:
                # forward-geocode the location text into (city, state, country)
                g = geocoder.google(loc)
                posting['location'] = (g.city, g.state, g.country)
        except Exception:  # missing span raises AttributeError; geocoder can raise others
# traceback.print_exc(file=stderr)
pass
# date posted logic
try:
date_posted_span = row_result_soup.find('span', attrs=IndeedScraper._job_date_span_attrs)
date_posted_text = PostingScraper._encode_unicode(date_posted_span.text).lower()
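            # Indeed uses relative dates ("just posted", "3 days ago");
            # normalize "just posted" to "now" so the parser can handle it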
if date_posted_text == "just posted":
date_posted_text = 'now'
try:
posting['date_posted'] = safe_dt_parse(date_posted_text) # also throws AttributeError
except (AttributeError, ValueError):
# traceback.print_exc(file=stderr)
pass
except AttributeError:
# traceback.print_exc(file=stderr)
pass
# description
try:
posting['description'] = PostingScraper._encode_unicode(
row_result_soup.find("span", attrs=IndeedScraper._job_description_span_attrs).text)
except AttributeError:
# traceback.print_exc(file=stderr)
pass
if 'description' not in posting: # try summary instead
try:
posting['description'] = PostingScraper._encode_unicode(
row_result_soup.find('span', attrs=IndeedScraper._job_summary_span_attrs).text
)
except AttributeError:
# traceback.print_exc(file=stderr)
pass
return posting

    def get_postings(self, query, pages=1):
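        """Search Indeed for `query`, scraping up to `pages` result pages."""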
try:
query = quote_plus(query)
            row_results = []  # raw result-row <div>s collected across pages
for page_num in range(1, pages+1):
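                # Indeed paginates 10 results per page via the &start= offset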
search_url = urlunsplit((self.scheme, self.source, "jobs",
"q=%s&l=%s&sort=date&start=%d" % (query, self.location, (page_num-1)*10), ""))
                print >> stderr, search_url  # debug: log each search URL
soup = PostingScraper._get_cleaned_soup_from_url(search_url)
job_results_td = soup.find('td', attrs=IndeedScraper._job_results_col_attrs)
try:
                    row_results.extend(job_results_td.findAll('div', IndeedScraper._job_row_result_div_attrs))
except AttributeError:
# traceback.print_exc(file=stderr)
pass
            return list(set(PostingScraper._remove_none_from_things(
                map(self._get_info_from_indeed_result, row_results))))
except Exception:
traceback.print_exc(file=stderr)
return []
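

# A minimal usage sketch (assumes the sibling PostingScraper and Posting
# modules are importable, that Posting behaves like a dict, and that
# indeed.com is reachable; the query and location below are illustrative,
# not from the source):
if __name__ == '__main__':
    scraper = IndeedScraper(location="Chicago")
    for p in scraper.get_postings("data scientist", pages=2):
        print p.get('title'), p.get('url')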