# Module-level imports assumed by this method.
from urllib2 import URLError
from lxml import html


def parse_page(self, url):
    '''
    Opens and parses the given URL with ``lxml``.

    :returns: a tuple with 3 elements:
        1. the page title
        2. content of the page
        3. located URLs on this page (absolute and normalized)
    '''
    response = self.opener.open(url)
    ctype = parse_content_type(response)
    if ctype != 'text/html':
        raise URLError('Wrong Content-Type: "%s"' % ctype)

    doc = html.parse(response).getroot()
    if doc is None:
        return None, None, None

    try:
        title = doc.xpath("//title/text()")[0].encode('utf-8')
    except IndexError:
        title = None
    content = doc.text_content().encode('utf-8')

    # Collect all outgoing links; lxml picks up the base URL from the
    # urllib2 response, so the links can be absolutized in place.
    # normalize_url() returns None for URLs the crawler should skip.
    links = set()
    doc.make_links_absolute()
    for _, _, link, _ in doc.iterlinks():
        normalized = normalize_url(link.encode('utf-8'))
        if normalized:
            links.add(normalized)
    return title, content, links
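# The two helpers used above, parse_content_type() and normalize_url(),
# are defined elsewhere in the project. Below is a minimal sketch of what
# they might look like, inferred only from how parse_page() calls them;
# the real implementations may differ.
import urlparse


def parse_content_type(response):
    # gettype() returns the media type lower-cased and without parameters,
    # e.g. 'text/html' for a 'Text/HTML; charset=utf-8' header.
    return response.info().gettype()


def normalize_url(url):
    # Strip the fragment, skip non-HTTP(S) schemes, and drop a trailing
    # slash so that equivalent URLs compare equal.
    url, _fragment = urlparse.urldefrag(url)
    if urlparse.urlsplit(url).scheme not in ('http', 'https'):
        return None
    return url.rstrip('/')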
# Assumes module-level: from threading import Lock
def __init__(self, urls, number_crawlers=NUMBER_CRAWLERS):
    self.crawlers = list()
    self.lock = Lock()
    # begin: shared data
    self.hosts = dict()
    self.urls = set()
    self.handled_urls = set()
    self.invalid_urls = set()
    # end: shared data
    self.add_urls(normalize_url(url) for url in urls)
    self.number_crawlers = number_crawlers
    self.start = 0.0
    self.stopping = False

    # Remember every page already stored in MongoDB so it is not
    # crawled a second time.
    self.conn = MongoConnector()
    for page in self.conn.db.pages.find():
        self.handled_urls.add(page['url'])
    self.num_previous_urls = len(self.handled_urls)
    print 'Admin initialized.'
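# A minimal sketch of MongoConnector, assumed here to be a thin wrapper
# around pymongo that exposes a `db` attribute, consistent with the
# `conn.db.pages.find()` call above. Host, port and database name are
# illustrative guesses, not taken from the excerpt.
from pymongo import MongoClient


class MongoConnector(object):

    def __init__(self, host='localhost', port=27017, db_name='crawler'):
        self.client = MongoClient(host, port)
        self.db = self.client[db_name]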