def get(self, page):
    """Print each blog entry on the page whose extracted text has a URL.

    For every fragment returned by ``self.extract_all`` the method derives
    the entry path (everything before the first double quote), the title
    and the text of the entry content. Fragments whose text contains
    ``"http://"`` are printed as four lines: the full entry URL, the
    title, the text and an empty separator line.

    ``page`` is accepted but not used here.

    NOTE(review): ``extract``, ``self.extract_all`` and ``html2txt`` are
    defined elsewhere; presumably they return the text between two
    markers and convert HTML to plain text -- confirm.
    """
    entries = self.extract_all(
        '<h2><a target="_blank" href="http://blog.jobbole.com',
        '<!-- .entry-content -->')
    for entry in entries:
        # The fragment starts right after the site prefix, so the text up
        # to the closing quote of the href is the entry path.
        quote_at = entry.find('"')
        path = entry[:quote_at]

        headline = extract('/', '<', entry)
        title = headline.split(">", 1)[-1]

        content = extract('<div class="entry-content">', '</p>', entry)
        paragraph = extract('<p', None, content)
        text = html2txt(paragraph)

        if "http://" not in text:
            continue

        # Single-argument print(...) behaves the same as the Python 2
        # print statement.
        print("http://blog.jobbole.com%s" % path)
        print(title)
        print(text)
        print("")
def get(self, page):
    """Print the blog entries on the page whose text mentions ``http://``.

    Each fragment produced by ``self.extract_all`` yields an entry path
    (the part before the first double quote), a title and the text of the
    entry content. When that text contains ``"http://"`` four lines are
    printed: full entry URL, title, text, and an empty line.

    ``page`` is part of the signature but is not read by this method.

    NOTE(review): ``extract``, ``self.extract_all`` and ``html2txt`` live
    elsewhere in the project; their exact contracts (text between two
    markers, HTML to plain text) are assumed -- confirm.
    """
    site = "http://blog.jobbole.com%s"
    fragments = self.extract_all(
        '<h2><a target="_blank" href="http://blog.jobbole.com',
        '<!-- .entry-content -->')
    for fragment in fragments:
        # Everything before the href's closing quote is the entry path.
        entry_path = fragment[:fragment.find('"')]

        title = extract('/', '<', fragment).split(">", 1)[-1]

        body_html = extract('<div class="entry-content">', '</p>', fragment)
        body_html = extract('<p', None, body_html)
        body_text = html2txt(body_html)

        if "http://" in body_text:
            # Single-argument print(...) matches the Python 2 print
            # statement's output.
            print(site % entry_path)
            print(title)
            print(body_text)
            print("")
def get(self, id):
    """Print one JSON array describing the page identified by ``id``.

    The printed array is ``[id, img, link, title, txt]``, where ``txt``
    falls back to an empty string when no description is found. Nothing
    is printed when the page has no ``beings-name`` or no
    ``beings-website`` link.

    NOTE(review): ``self.extract``, ``extract``, ``unescape`` and
    ``dumps`` are defined outside this block; ``extract`` presumably
    returns the text between two markers -- confirm.
    """
    raw_title = self.extract('<div class="beings-name">', '</div>')
    if not raw_title:
        return
    title = unescape(raw_title)

    link = self.extract('<div class="beings-website"><a href="', '"')
    if not link:
        return

    description = self.extract('<div class="beings-description">', '</div>')
    if description:
        description = unescape(description)

    # The avatar image URL is the src attribute inside the avatar anchor.
    avatar_html = self.extract('<a class="avatar" href="/', '</a>')
    img = extract('src="', '"', avatar_html)

    record = [id, img, link, title, description or '']
    # Single-argument print(...) behaves like the Python 2 print statement.
    print(dumps(record))
def get(self):
    """Queue follow-up URLs for a xianguo.com ``/find/recommend`` page.

    Reads the ``id`` and ``pi`` request arguments (both default to 0).

    * With a non-zero ``id``: every link found inside an
      ``<h3 class="nickname">`` element is queued. On the first page
      (``pi == 0``) every non-zero page number found in the pagination
      links is queued for the same ``id`` as well.
    * Without an ``id``: every recommend id linked from the page is
      queued with ``pi=0``.

    NOTE(review): ``spider``, ``extract``, ``self.extract_all`` and
    ``self.get_argument`` are defined outside this block; their contracts
    are assumed from how they are called here -- confirm.
    """
    now_id = int(self.get_argument("id", 0))
    page = int(self.get_argument("pi", 0))

    if not now_id:
        for recommend_id in self.extract_all('href="/find/recommend?id=', '"'):
            spider.put(
                "http://xianguo.com/find/recommend?id=%s&pi=0" % recommend_id)
        return

    for nickname_html in self.extract_all('<h3 class="nickname">', '</h3>'):
        target = extract('"/', '"', nickname_html)
        spider.put("http://xianguo.com/" + target)

    if page == 0:
        page_numbers = set(
            self.extract_all('href="/find/recommend?pi=', "&"))
        for number in map(int, page_numbers):
            # Fix: the original tested and formatted `page`, which is
            # always 0 in this branch, so no further page was ever queued.
            # Use the page number taken from the pagination link; page 0
            # is the page being handled right now, so it is skipped.
            if number:
                spider.put(
                    "http://xianguo.com/find/recommend?id=%s&pi=%s"
                    % (now_id, number))
def get(self, id):
    """Emit a JSON array ``[id, img, link, title, txt]`` for this page.

    Returns without printing when either the ``beings-name`` block or the
    ``beings-website`` link is missing. A missing description is emitted
    as an empty string.

    NOTE(review): ``self.extract``, ``extract``, ``unescape`` and
    ``dumps`` come from outside this block; ``extract`` is assumed to
    return the text between two markers -- confirm.
    """
    name_html = self.extract('<div class="beings-name">', '</div>')
    if not name_html:
        return
    title = unescape(name_html)

    website = self.extract('<div class="beings-website"><a href="', '"')
    if not website:
        return

    summary = self.extract('<div class="beings-description">', '</div>')
    if summary:
        summary = unescape(summary)

    # Pull the image URL out of the avatar anchor's markup.
    avatar_anchor = self.extract('<a class="avatar" href="/', '</a>')
    avatar_src = extract('src="', '"', avatar_anchor)

    # Single-argument print(...) behaves like the Python 2 print statement.
    print(dumps([id, avatar_src, website, title, summary or '']))
def get(self):
    """Queue follow-up URLs for a xianguo.com ``/find/recommend`` page.

    Reads the ``id`` and ``pi`` request arguments (both default to 0).

    * With a non-zero ``id``: every link found inside an
      ``<h3 class="nickname">`` element is queued. On the first page
      (``pi == 0``) every non-zero page number found in the pagination
      links is queued for the same ``id`` as well.
    * Without an ``id``: every recommend id linked from the page is
      queued with ``pi=0``.

    NOTE(review): ``spider``, ``extract``, ``self.extract_all`` and
    ``self.get_argument`` are defined outside this block; their contracts
    are assumed from how they are called here -- confirm.
    """
    now_id = int(self.get_argument("id", 0))
    page = int(self.get_argument("pi", 0))

    if now_id:
        for nickname_html in self.extract_all(
                '<h3 class="nickname">', '</h3>'):
            target = extract('"/', '"', nickname_html)
            spider.put("http://xianguo.com/" + target)

        if page == 0:
            page_numbers = set(
                self.extract_all('href="/find/recommend?pi=', "&"))
            for number in map(int, page_numbers):
                # Fix: the original tested and formatted `page`, which is
                # always 0 in this branch, so the put below could never
                # run. Use the page number from the pagination link and
                # skip 0, which is the page being handled right now.
                if number:
                    spider.put(
                        "http://xianguo.com/find/recommend?id=%s&pi=%s"
                        % (now_id, number))
    else:
        for recommend_id in self.extract_all(
                'href="/find/recommend?id=', '"'):
            spider.put(
                "http://xianguo.com/find/recommend?id=%s&pi=0" % recommend_id)