forked from thinker007/scrapely-hack
-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.py
60 lines (51 loc) · 2.41 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import urllib, scraperwiki
try:
import json
except ImportError:
import simplejson as json
from scrapely.htmlpage import HtmlPage
from scrapely.template import TemplateMaker, best_match
from scrapely.extraction import InstanceBasedLearningExtractor
from scrapely import Scraper
class Scraper2(Scraper):
    """Scrapely Scraper subclass that can train and scrape from an in-memory
    HTML string (via the ``html=`` keyword) instead of always fetching a URL.

    This lets the output of another fetcher (e.g. mechanize or scraperwiki)
    be fed straight into scrapely.
    """
    def train(self, url=None, data=None, html=None, encoding='utf-8'):
        """Build a template from one example page.

        :param url: page URL to fetch (used when ``html`` is not given).
        :param data: dict of {field_name: value or list of values} to annotate.
        :param html: raw HTML string of the page (skips the fetch).
        :param encoding: encoding used to decode byte strings.
        :raises ValueError: if ``data`` is empty or missing.
        """
        # Explicit raise instead of `assert`: asserts vanish under `python -O`,
        # and empty training data is a caller error we must always report.
        if not data:
            raise ValueError("Cannot train with empty data")
        page = self._get_page(url, encoding, html)
        tm = TemplateMaker(page)
        for field, values in data.items():
            # Accept a single scalar value as shorthand for a one-item list.
            if not hasattr(values, '__iter__'):
                values = [values]
            for value in values:
                # Python 2: decode byte strings so matching is done in unicode.
                if isinstance(value, str):
                    value = value.decode(encoding)
                tm.annotate(field, best_match(value))
        self.templates.append(tm.get_template())

    def scrape(self, url=None, html=None, encoding='utf-8'):
        """Extract data from a page using the trained templates.

        :param url: page URL to fetch (used when ``html`` is not given).
        :param html: raw HTML string of the page (skips the fetch).
        :returns: list of extracted item dicts (first element of the
            extractor's result tuple).
        """
        ## not version from https://github.com/scrapy/scrapely/blob/master/scrapely/extraction/pageparsing.py
        ## may need to replace with version from inspect.getsourcelines(Scraper.scrape), as this version is
        page = self._get_page(url, encoding, html)
        ex = InstanceBasedLearningExtractor(self.templates)
        return ex.extract(page)[0]

    @staticmethod
    def _get_page(url=None, encoding=None, html=None):
        """Return an HtmlPage, built from ``html`` if given, else fetched
        from ``url``.
        """
        if html:
            body = html.decode(encoding)
        else:
            # Close the urlopen response explicitly: the original leaked the
            # connection object until garbage collection.
            response = urllib.urlopen(url)
            try:
                body = response.read().decode(encoding)
            finally:
                response.close()
        return HtmlPage(url, body=body, encoding=encoding)
### Basic usage:
### s=Scraper2()
### data = {'name':'value pairs'} # like normal Scrapely
### s.train(data=data, html=string_of_html)
### output = s.scrape(html=different_string_of_html)
### Everything below this line is an example from https://github.com/scrapy/scrapely, with edits.
s = Scraper2() # note how we're *not* using Scraper() - this uses our custom version
url1 = 'http://pypi.python.org/pypi/w3lib'
html1 = scraperwiki.scrape(url1) # get the HTML - this could be the output from mechanize
data = {'name': 'w3lib 1.0', 'author': 'Scrapy project', 'description': 'Library of web-related functions'}
s.train(data) # pick one of these lines
#s.train(url1,data)
url2 = 'http://pypi.python.org/pypi/Django/1.3'
html2 = scraperwiki.scrape(url2)
print s.scrape(None, html2) # pick one of these lines.
#print s.scrape(url2)