def fetch(self):
    """Download the 360buy list page and emit ``object_found`` for each entry.

    Side effects only (no return value): fetches LIST_URL, logs every
    product found, and sends the ``object_found`` signal with
    time/title/url/preview/price keyword arguments.
    """
    self.logger.info("Start fetching data from www.360buy.com...")
    # The site serves GBK-encoded pages; tell lxml explicitly.
    parser = etree.HTMLParser(encoding='gbk')
    # Close the connection even if read() fails (original leaked the handle).
    page = urllib2.urlopen(LIST_URL)
    try:
        text = page.read(-1)
    finally:
        page.close()
    tree = etree.HTML(text, parser=parser)
    # Stamp every emitted object with today's date as "YYYY-MM-DD".
    # (Renamed from ``time`` to avoid shadowing the stdlib module name.)
    today = datetime.datetime.now().date().strftime('%Y-%m-%d')
    for node in tree.xpath(LIST_XPATH):
        title_node = node.find(TITLE_PATH)
        preview_node = node.find(PREVIEW_PATH)
        price_node = node.find(PRICE_PATH)
        # Skip malformed entries instead of raising AttributeError and
        # aborting the whole fetch.
        if title_node is None or preview_node is None or price_node is None:
            self.logger.warning("Skipping malformed list entry")
            continue
        title = title_node.attrib['title']
        url = title_node.attrib['href']
        preview = preview_node.attrib['src']
        price = price_node.text
        # Lazy %-args: formatting only happens if the level is enabled.
        self.logger.info("%s: %s - %s", today, title, url)
        self.logger.info("%s - %s", price, preview)
        object_found.send(self, time=today, title=title, url=url,
                          preview=preview, price=price)
def _get_objects_from_url(self, url): objects = [] parser = etree.HTMLParser(encoding='gbk') text = urllib2.urlopen(url).read(-1) tree = etree.HTML(text, parser=parser) nodes = tree.xpath(LIST_XPATH) for node in nodes: try: title_node = node.find('li[1]/a') time_node = node.find('li[2]') url_node = node.find('li[3]/a') if url_node is None: continue new_url = urlparse.urljoin(url, url_node.attrib['href']).replace('../', '') text = urllib2.urlopen(new_url).read(-1).decode('gbk') time = datetime.datetime.strptime(time_node.text, "%Y-%m-%d") title = title_node.text title_parts = title.split(' ') contest = title_parts[0] contest_info = title_parts[1] object_got = { 'time': time, 'title': title, 'url': new_url, } print " Object retrieved: %s" % object_got['title'] self.logger.info("%s: %s - %s" % (object_got, title, new_url)) objects.append(object_got) object_found.send(self, time=time, title=title, url=new_url, check=True, contest=contest, info=contest_info, qipu=text) except: pass return objects