from bs4 import BeautifulSoup

# Callbacks and Spider are local project modules; their import paths are not
# shown in this snippet.


class Special(Callbacks):
    siteRoot = 'http://www.duokan.com'
    (EVT_FIND_LINK, EVT_FIND_BOOK) = range(0, 2)

    def __init__(self, proxyHost='', proxyAuthUser='', proxyAuthPswd=''):
        Callbacks.__init__(self)
        self.init([Special.EVT_FIND_LINK, Special.EVT_FIND_BOOK])
        self.titles = {}
        self.links = {}
        self.authors = {}
        self.callbacks = {
            'http://www.duokan.com/special': self.findBooks,
            'http://www.duokan.com/book': self.findBook,
            'http://www.duokan.com': self.findLinks,
            # 'http://www.duokan.com/r/%E5%85%8D%E8%B4%B9%E4%B8%93%E5%8C%BA': self.finfLimitFree,
        }
        self.spider = Spider('Duokan Special')
        if len(proxyHost) > 0:
            self.spider.set_proxy(proxyHost, proxyAuthUser, proxyAuthPswd)
        self.spider.add_callbacks(self.callbacks)
        self.spider.add_urls([
            Special.siteRoot,
            # 'http://www.duokan.com/r/%E5%85%8D%E8%B4%B9%E4%B8%93%E5%8C%BA'
        ])

    def findLinks(self, url, response):
        '''Find the "limited-time free" banner link on the front page.'''
        self.soup = BeautifulSoup(response, from_encoding='utf8')
        list_nodes = self.soup.findAll('div', attrs={'class': 'u-aimg'})
        if len(list_nodes) > 0:
            list_node = list_nodes[0]
            links = list_node.findAll('a')
            # limit free read
            link = links[0]
            link = [Special.siteRoot + link['href']]
            self.spider.add_urls(link)
            self.dispatch(Special.EVT_FIND_LINK, link[0])
            # limit free buy
            # link = links[2]
            # link = [Special.siteRoot + link['href']]
            # self.spider.add_urls(link)

    def finfLimitFree(self, url, response):
        '''Find the first "limited-time free" book item on a listing page.'''
        self.soup = BeautifulSoup(response, from_encoding='utf8')
        list_nodes = self.soup.findAll('li', attrs={'class': 'u-bookitm1 j-bookitm'})
        if len(list_nodes) > 0:
            list_node = list_nodes[0]
            links = list_node.findAll('a')
            # limit free read
            link = links[0]
            link = [Special.siteRoot + link['href']]
            self.spider.add_urls(link)
            self.dispatch(Special.EVT_FIND_LINK, link[0])

    def findBooks(self, url, response):
        '''Parse a special-offer list page and collect id/title/author/link for each book.'''
        self.soup = BeautifulSoup(response, from_encoding='utf8')
        book_nodes = self.soup.findAll('li', attrs={'class': 'u-bookitm1 j-bookitm'})
        for item in book_nodes:
            id = item['data-id']
            if id:
                title = item.find('a', attrs={'class': 'title'}).string
                link = item.find('a', attrs={'class': 'title'})['href']
                author = item.find('div', attrs={'class': 'u-author'}).find('span').string
                self.titles[id] = title
                self.links[id] = Special.siteRoot + link
                self.authors[id] = author
                self.dispatch(Special.EVT_FIND_BOOK, id, self.titles[id],
                              self.authors[id], self.links[id])
        return self.titles

    def findBook(self, url, response):
        '''Parse a single book page: extract the book info from the inline
        window.dk_data script block.'''
        self.soup = BeautifulSoup(response, from_encoding='utf8')
        # Earlier markup-based approach, kept for reference:
        # content = self.soup.find('meta', attrs={'name': 'apple-itunes-app'})['content'].split('/')
        # id = content[len(content) - 1]
        # descNode = self.soup.findAll('div', attrs={'class': 'desc'})
        # title = descNode[0].find('h3').string
        # author = descNode[0].find('td', attrs={'class': 'author'}).find('a').string
        # link = self.soup.find('div', attrs={'class': 'cover', 'id': 'cover-img'}).find('a')['href']
        # link = DuokanSpecial.siteRoot + link
        # self.dispatch(DuokanSpecial.ON_FIND_BOOK, id, title, author, link)
        author = ''
        scriptNodes = self.soup.findAll('script', attrs={'type': 'text/javascript'})
        for node in scriptNodes:
            text = node.string
            if text and text.find('window.dk_data') > 0:
                # Cut out the object literal assigned to window.dk_data.
                start = text.index('=') + len('=')
                end = text.index('window.dk_data.comments_url')
                text = text[start:end]
                # Quote the bare JavaScript keys so the literal can be
                # evaluated as a Python dict.
                text = text.replace('book_id :', '\'book_id\' :')
                text = text.replace('book :', '\'book\' :')
                text = text.replace('sid :', '\'sid\' :')
                text = text.replace('id :', '\'id\' :')
                text = text.replace('title : ', '\'title\' : u')
                text = text.replace('old_price :', '\'old_price\' :')
                text = text.replace('price :', '\'price\' :')
                text = text.replace('cover :', '\'cover\' :')
                text = text.replace('url :', '\'url\' :')
                text = text.replace('webreader :', '\'webreader\' :')
                text = text.replace('limited_time :', '\'limited_time\' :')
                text = text.replace('authors : ', '\'authors\' : u')
                dk_data = eval(text)
                id = dk_data['book']['id']
                title = dk_data['book']['title']
                author = dk_data['book']['authors']
                link = Special.siteRoot + dk_data['book']['url']
                self.dispatch(Special.EVT_FIND_BOOK, id, title, author, link)

    def start(self):
        self.spider.start()

    def stop(self):
        self.spider.stop()

    def getTitle(self):
        return self.titles

    def getLinks(self):
        return self.links

    def getAuthors(self):
        return self.authors
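# --- Illustration of the findBook() rewrite trick (hypothetical data) ---------
# The inline window.dk_data script carries a JavaScript object literal with
# bare keys; findBook() quotes those keys so the literal can be eval'ed as a
# Python dict. The fragment below only mimics that shape -- the real page
# layout may differ.
def demo_dk_data_rewrite():
    js = '{ book : { id : "b1", title : "Sample", authors : "Someone", url : "/book/b1" } }'
    py = js.replace('book :', '\'book\' :')
    py = py.replace('id :', '\'id\' :')
    py = py.replace('title : ', '\'title\' : u')
    py = py.replace('authors : ', '\'authors\' : u')
    py = py.replace('url :', '\'url\' :')
    dk_data = eval(py)
    print dk_data['book']['title']  # -> Sample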
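# --- Usage sketch for Special (assumed, not part of the original module) ------
# Construct the crawler (optionally behind a proxy), run it, then read the
# collected titles/authors/links back through the getters above. Whether
# start() blocks until the crawl finishes depends on the local Spider class;
# if it returns immediately, read the getters only after the crawl is done.
def demo_special():
    special = Special()  # or Special('proxy.example.com:8080', 'user', 'pswd')
    special.start()
    for book_id in special.getTitle():
        print '%s | %s | %s' % (book_id, special.getTitle()[book_id],
                                special.getLinks()[book_id])
    special.stop()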
import json
import os

# Spider, WsjPersist, ImgPage, ImgPageLinks and Log are local project modules;
# their import paths are not shown in this snippet.


class WsjImg:
    site_root = 'http://cn.wsj.com/'
    page_root = 'http://cn.wsj.com/gb/'
    img_root = 'http://cn.wsj.com/pictures/photo/'
    starts = ['http://cn.wsj.com/gb/pho.asp']
    # starts = ['http://cn.wsj.com/gb/20141230/PHO094555.asp']
    # callbacks = {'http://cn.wsj.com/gb/pho.asp': WsjImg.find_links,
    #              'http://cn.wsj.com/gb/': WsjImg.parse_page,
    #              'http://cn.wsj.com/pictures/photo/': WsjImg.save_img}

    # page url path
    # ['', 'gb', '20130528', 'PHO184538.asp']
    idx_page_date = 2
    idx_page_filename = 3
    # img url path
    # ['', 'pictures', 'photo', 'BJ20141226094555', '01.jpg']
    idx_img_dir = 3
    idx_img_filename = 4
    # persist
    DIR_BASE = 'base'
    DIR_ROOT = 'dat'
    DIR_IMG = 'img'

    def __init__(self, start='', end=''):
        self.TAG = WsjImg.__name__
        self.init_date(start, end)
        self.db = WsjPersist()
        self.callbacks = {
            'http://cn.wsj.com/gb/pho.asp': self.find_links,
            'http://cn.wsj.com/gb/20': self.parse_page,
            'http://cn.wsj.com/pictures/photo/': self.save_img
        }
        self.spider = Spider('WsjImg')
        self.spider.set_proxy('proxy-amer.delphiauto.net:8080', 'rzfwch', '8ik,mju7')
        self.spider.add_callbacks(self.callbacks)
        self.spider.add_urls(self.starts)
        self.spider.start()

    def init_date(self, strStart='', strEnd=''):
        '''Initialize the start/end date strings used to filter page links.'''
        self.strStart = strStart
        self.strEnd = strEnd

    def find_links(self, url, response):
        '''Parse the photo-news index page and collect photo-news page urls.'''
        Log.i(self.TAG, 'find links in %s' % url)
        links = ImgPageLinks(response, self.strStart, self.strEnd)
        urls = links.getLinks(response)
        # urls = links.persistToDB(self.db)
        self.spider.add_urls(urls)

    def parse_page(self, url, response):
        '''Parse a photo-news page: extract the content and image urls, plus
        links to further photo-news pages.'''
        # find img page links
        self.find_links(url, response)
        # process the image page
        imgPage = ImgPage(url, response)
        imgPage.clear()
        imgPage.parseImgUrls()
        if len(imgPage.imgUrls.keys()) > 1:
            imgPage.save(os.path.join(WsjImg.DIR_ROOT, imgPage.filePath))
            with open(os.path.join(WsjImg.DIR_ROOT, imgPage.data['path']), 'w') as f:
                f.write(json.dumps(imgPage.data))
            imgPage.persistToDB(self.db)
            self.db.updateArt(url, imgPage.title, imgPage.summary)
            # save the images of the page
            self.save_imgs(imgPage)
            # copy base files alongside the saved page
            # os.system('cp -a %s/* %s/' % (WsjImg.dir_base, os.path.join(WsjImg.dir_root, page_date)))
            self.spider.fetch.copyall(WsjImg.DIR_BASE,
                                      os.path.join(WsjImg.DIR_ROOT, imgPage.pageDate))
        else:
            print 'no link found in %s' % url

    def save_img(self, url, response):
        print 'ignore %s' % url

    def save_imgs(self, imgPage):
        for url in imgPage.imgUrls.keys():
            dstfile = os.path.join(WsjImg.DIR_ROOT, imgPage.imgUrls[url]['path'])
            self.spider.download(url, dstfile)
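# --- Usage sketch for WsjImg (assumed, not part of the original module) -------
# WsjImg.__init__ wires up the callbacks and calls spider.start() itself, so
# constructing the object runs the crawl. The 'YYYYMMDD' date format is an
# assumption based on the /gb/<date>/ page paths documented above.
def demo_wsj_img():
    WsjImg('20141201', '20141231')  # crawl photo news published in this range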