@route('/find/recommend')
class _(Handler):
    def get(self):
        now_id = int(self.get_argument("id", 0))
        page = int(self.get_argument("pi", 0))
        if now_id:
            # queue every recommended user's homepage
            for link in self.extract_all('<h3 class="nickname">', '</h3>'):
                link = extract('"/', '"', link)
                spider.put("http://xianguo.com/" + link)
            if page == 0:
                # on the first result page, queue the remaining pages
                page_list = set(
                    self.extract_all('href="/find/recommend?pi=', '&'))
                for i in map(int, page_list):
                    if i:
                        spider.put(
                            "http://xianguo.com/find/recommend?id=%s&pi=%s"
                            % (now_id, i))
        else:
            # entry page: queue every recommendation category
            for id in self.extract_all('href="/find/recommend?id=', '"'):
                spider.put("http://xianguo.com/find/recommend?id=%s&pi=0" % id)

if __name__ == '__main__':
    URL = 'http://xianguo.com/find/recommend'
    spider.put(URL)
    # 10 concurrent fetch threads, 30-second page read timeout
    spider.run(10, 30)
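# The handler above relies on the framework's extract()/extract_all()
# helpers, whose implementation lives outside this file. The assumed
# behaviour is plain substring slicing between a begin marker and an end
# marker; a reference sketch, kept commented out so it does not shadow the
# real helpers:
#
#   def extract(begin, end, html):
#       start = html.find(begin)
#       if start < 0:
#           return None
#       start += len(begin)
#       stop = html.find(end, start)
#       return html[start:stop] if stop >= 0 else None
#
#   def extract_all(begin, end, html):
#       result, pos = [], 0
#       while True:
#           start = html.find(begin, pos)
#           if start < 0:
#               return result
#           start += len(begin)
#           stop = html.find(end, start)
#           if stop < 0:
#               return result
#           result.append(html[start:stop])
#           pos = stop + len(end)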
        doc_id = collection.insert(item)
        # add a document to the index
        solrConnection.add(
            id=doc_id,
            title=item.get('title'),
            description=item.get('description'),
            subtitle=item.get('subtitle'),
            information=item.get('information'))
        # commit to solr
        solrConnection.commit()

@route('/images/.+')
class pic(Handler):
    def get(self):
        save_pic(self.html, route.path.split('/')[-1])

@route('/wp-content/uploads/.+')
class pic2(Handler):
    def get(self):
        save_pic(self.html, route.path.split('/')[-1])

def save_pic(content, fname):
    basepath = join(PREFIX, 'images')
    fpath = join(basepath, fname)
    with open(fpath, 'wb') as f:
        f.write(content)
    print 'Download image: ' + fname

if __name__ == '__main__':
    spider.put('http://www.durex.com.cn/products')
    spider.run(5, 100)
    item.writedb()
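# Note: save_pic() assumes the PREFIX/images directory already exists;
# open() raises IOError otherwise. One way to guarantee that before the
# crawl starts (needs `import os`; the placement inside __main__ is an
# assumption):
#
#   if not os.path.isdir(join(PREFIX, 'images')):
#       os.makedirs(join(PREFIX, 'images'))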
        html = self.extract('<div class="t_fsz">', '<div id="comment_')
        html = html[:html.rfind('</div>')]
        tid = int(self.get_argument('tid'))
        print tid, name
        self.page.append((tid, self.request.url, name, html))

    @classmethod
    def write(cls):
        page = cls.page
        page.sort(key=itemgetter(0), reverse=True)
        with open(join(PREFIX, 'ecocn_org.xml'), 'w') as rss:
            rss.write(
                cls.template.render(
                    rss_title='经济学人 . 中文网',
                    rss_link='http://www.ecocn.org',
                    li=[
                        dict(link=link, title=title, txt=txt)
                        for id, link, title, txt in cls.page
                    ]
                )
            )

if __name__ == '__main__':
    spider.put('http://www.ecocn.org/portal.php?mod=list&catid=1')
    # 10 concurrent fetch threads, 30-second page read timeout
    spider.run(10, 30)
    forum.write()
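# cls.template above is rendered with rss_title, rss_link and li; its
# definition sits outside this fragment. A minimal sketch of what such an
# RSS template could look like with mako (that mako is the engine actually
# used here is an assumption):
#
#   from mako.template import Template
#   template = Template(u'''<?xml version="1.0" encoding="utf-8"?>
#   <rss version="2.0"><channel>
#   <title>${rss_title}</title>
#   <link>${rss_link}</link>
#   % for i in li:
#   <item>
#   <title>${i["title"]}</title>
#   <link>${i["link"]}</link>
#   <description><![CDATA[${i["txt"]}]]></description>
#   </item>
#   % endfor
#   </channel></rss>''')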
        if link not in self.indexes:
            print 'link_index', link
            self.indexes[link] = None
            spider.put(link)

# YAML front matter prepended to every exported post
template = unicode("""---
title: %s
date: %s
categories: %s
---
""", 'utf-8')

if __name__ == '__main__':
    spider.put("http://%s/index_1.html" % site)
    # spider.put("http://frostyplanet.blogbus.com/c1566502/")
    spider.run(5, 5)
    output_dir = os.path.join(PREFIX, "output")
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    for _id, title, times, category, html in blog_post.posts:
        file_path = os.path.join(output_dir, str(category) + "_" + str(_id))
        content = template % (title, times, category) + unicode(html, 'utf-8')
        f = open(file_path, "w")
        try:
            f.write(content.encode('utf-8'))
        finally:
            f.close()
        print "wrote", file_path
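# For illustration, filling the front-matter template with hypothetical
# values:
#
#   template % (u"Hello World", "2011-01-01 12:00:00", "life")
#
# produces the header that precedes the converted post body:
#
#   ---
#   title: Hello World
#   date: 2011-01-01 12:00:00
#   categories: life
#   ---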
        array_files = output.split('|')
        return array_files

    def get_volume_name(self):
        volume_name = VOLUMES_NAME_DICT[self.request.url]
        return volume_name

    def get(self):
        for segment in self.extract_all('<script>var s_files="', '</script>'):
            segment_list = segment.split('";')
            s_files = segment_list[0]
            # http://www.blgl8.com/script/ds/ds.js
            s_ds = 'http://comic.1mh.in:2813/'
            s_path = segment_list[1].split('="')[1]
            array_files = self.get_array_files(s_files)
            volume_name = self.get_volume_name()
            for pic_file in array_files:
                # print volume_name + s_ds + s_path + pic_file
                wget_command = ('wget ' + s_ds + s_path + pic_file +
                                ' -P ' + volume_name)
                print wget_command
                # status, output = commands.getstatusoutput(wget_command)
                # print status, output

if __name__ == '__main__':
    # spider.put('http://www.blgl8.com/comic-i/blgl44469/')
    spider.put('http://www.blgl8.com/comic-i/blgl39895/')
    spider.run(1, 5)
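# To actually execute the downloads instead of only printing the commands,
# the commented-out commands.getstatusoutput() call above can be restored.
# A minimal sketch, assuming the Python 2 commands module (the helper name
# run_wget is hypothetical):
import commands

def run_wget(wget_command):
    # getstatusoutput() returns an (exit_status, combined_output) tuple
    status, output = commands.getstatusoutput(wget_command)
    if status != 0:
        print 'wget failed:', wget_command
    return status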
        number = filter(lambda ch: ch in "0123456789", li.text)
        bracket_index = li.text.index("(")
        brand_name = li.text[0:bracket_index]
        brand_list.append(brand_name + " " + number)
        print brand_name + ":" + number

        with open(join(PREFIX, "brands.txt"), "w") as f:
            for brand in brand_list:
                f.write("%s\n" % brand)

        page_size = 20
        if int(number) % page_size != 0:
            page_num = int(number) / page_size + 1
        else:
            page_num = int(number) / page_size
        for pn in range(page_num):
            page = pn + 1
            url = link + "pn" + str(page)
            print "Fetch data from: " + HTTP % url
            spider.put(HTTP % url)

if __name__ == "__main__":
    # spider.put('http://www.yeedou.com/anquantao-c2441/')
    readHtml()
    spider.run(5, 3000)
    item.writedb()
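# Note: the if/else that computes page_num above is ceiling division; with
# Python 2 integer division it collapses to a single expression:
#
#   page_num = (int(number) + page_size - 1) / page_size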