-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse.py
46 lines (43 loc) · 1.54 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import msgpack
from bs4 import BeautifulSoup, Tag, NavigableString
import re
import warnings
def load(paths):
    """Yield one job-posting dict per document found in msgpack dump files.

    Each path is decoded with ``msgpack.unpack`` and the result is iterated
    as a sequence of document mappings. Paths are processed in sorted order.

    Args:
        paths: iterable of file paths to msgpack dumps.

    Yields:
        dict with keys ``title``, ``desc``, ``url``, ``company``,
        ``location`` and ``published``. ``desc`` is the text of the element
        with id ``jobDescription`` in the document's ``_page`` HTML, and
        ``published`` is the value returned by ``parse_publication_time``.

    Documents whose ``publication_time`` cannot be parsed (ValueError) are
    reported through ``warnings.warn`` and skipped.
    """
    for path in sorted(paths):
        # msgpack is a binary format, so read it in binary mode; the whole
        # payload is decoded up front, which lets the handle be closed
        # before any document is yielded.
        # NOTE(review): the ``encoding`` keyword was dropped in msgpack 1.0
        # (``raw=False`` replaces it) -- confirm the pinned msgpack version.
        with open(path, 'rb') as fh:
            docs = msgpack.unpack(fh, encoding='utf-8')
        for doc in docs:
            soup = BeautifulSoup(doc['_page'])
            desc = soup.find(id='jobDescription').text
            try:
                date = parse_publication_time(doc['publication_time'])
            except ValueError as err:
                # Best effort: a malformed date drops this posting only.
                warnings.warn(str(err))
                continue
            yield dict(title=doc['name'], desc=desc,
                       url=doc['job_url'], company=doc['employer'],
                       location=doc['location_text'],
                       published=date)
def load_ilmo(path):
    """Yield ``dict(title=..., desc=...)`` sections from an HTML file.

    The document body is walked in document order. Every ``<h1>`` with
    non-blank text starts a new section; the stripped, non-empty text nodes
    seen until the next such heading are joined with single spaces to form
    that section's ``desc``. Text that precedes the first heading is
    discarded.

    Args:
        path: path of the HTML file to parse.

    Yields:
        dict with keys ``title`` (stripped heading text) and ``desc``.
    """
    # Parse while the file is open, then release the handle immediately;
    # the soup no longer needs it once constructed.
    with open(path, 'rb') as fh:
        soup = BeautifulSoup(fh)
    title = None
    desc = []
    for el in soup.body.descendants:
        # Test for text nodes first: only tags are then asked for ``.name``
        # and ``.text``.
        if isinstance(el, NavigableString):
            txt = el.strip()
            if txt:
                desc.append(txt)
        elif el.name == 'h1' and el.text.strip():
            if title is not None:
                assert desc
                yield dict(title=title, desc=' '.join(desc))
            title = el.text.strip()
            # NOTE(review): the heading's own text nodes are visited after
            # the heading tag, so they presumably end up in ``desc`` too.
            desc = []
    # Flush the last section, which has no following heading to trigger it.
    if title is not None:
        assert desc
        yield dict(title=title, desc=' '.join(desc))
# Day.month.year with one- or two-digit day and month and a four-digit year.
date_re = re.compile(r'^(\d\d?)\.(\d\d?)\.(\d\d\d\d)$')


def parse_publication_time(txt):
    """Convert a ``D.M.YYYY`` date string to ``YYYY-MM-DD``.

    Args:
        txt: date text such as ``'1.2.2013'``.

    Returns:
        The date as a zero-padded ``'%04d-%02d-%02d'`` string
        (year, month, day).

    Raises:
        ValueError: if ``txt`` does not match ``date_re``. The message
            contains the offending text.
    """
    m = date_re.search(txt)
    if m is None:
        # Build the message explicitly: the three-argument ``raise`` form
        # treats its third argument as a traceback, not a format value.
        raise ValueError('parse error: %s' % (txt,))
    day, month, year = map(int, m.groups())
    return '%04d-%02d-%02d' % (year, month, day)