Example #1
    def __init__(self, proxyHost='', proxyAuthUser='', proxyAuthPswd=''):
        Callbacks.__init__(self)
        self.init([Special.EVT_FIND_LINK, Special.EVT_FIND_BOOK])
        self.titles = {}
        self.links = {}
        self.authors = {}

        self.callbacks = {
            'http://www.duokan.com/special': self.findBooks,
            'http://www.duokan.com/book': self.findBook,
            'http://www.duokan.com': self.findLinks,
            # 'http://www.duokan.com/r/%E5%85%8D%E8%B4%B9%E4%B8%93%E5%8C%BA': self.finfLimitFree,
        }
        self.spider = Spider('Duokan Special')
        if len(proxyHost) > 0:
            self.spider.set_proxy(proxyHost, proxyAuthUser, proxyAuthPswd)
        self.spider.add_callbacks(self.callbacks)
        self.spider.add_urls([
            Special.siteRoot,
            # 'http://www.duokan.com/r/%E5%85%8D%E8%B4%B9%E4%B8%93%E5%8C%BA'
        ])
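All of these examples register their handlers through Spider.add_callbacks, where each key is a URL prefix. The Spider implementation itself is not shown on this page, but the pattern suggests it routes every fetched page to the handler whose key prefixes the page URL. Below is a minimal sketch of that kind of prefix dispatch, written purely as an assumption about the Spider internals, not the project's actual code:

# Hypothetical prefix dispatch; not the real Spider implementation.
def dispatch(callbacks, url, response):
    # Try the longest prefix first so 'http://www.duokan.com/book'
    # wins over the bare site root 'http://www.duokan.com'.
    for prefix in sorted(callbacks, key=len, reverse=True):
        if url.startswith(prefix):
            return callbacks[prefix](url, response)
    return None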
Example #2
    def __init__(self, start='', end=''):
        self.TAG = WsjImg.__name__
        self.init_date(start, end)
        self.db = WsjPersist()

        self.callbacks = {
                'http://cn.wsj.com/gb/pho.asp': self.find_links, 
                'http://cn.wsj.com/gb/20': self.parse_page,
                'http://cn.wsj.com/pictures/photo/': self.save_img
        }
        self.spider = Spider('WsjImg')
        self.spider.set_proxy('proxy-amer.delphiauto.net:8080', 'rzfwch', '8ik,mju7')
        self.spider.add_callbacks(self.callbacks)
        self.spider.add_urls(self.starts)
        self.spider.start()
Example #3
class Special(Callbacks):
    siteRoot = 'http://www.duokan.com'

    (EVT_FIND_LINK, EVT_FIND_BOOK) = range(0, 2)

    def __init__(self, proxyHost='', proxyAuthUser='', proxyAuthPswd=''):
        Callbacks.__init__(self)
        self.init([Special.EVT_FIND_LINK, Special.EVT_FIND_BOOK])
        self.titles = {}
        self.links = {}
        self.authors = {}

        self.callbacks = {'http://www.duokan.com/special': self.findBooks,
                          'http://www.duokan.com/book': self.findBook,
                          'http://www.duokan.com': self.findLinks,
                          # 'http://www.duokan.com/r/%E5%85%8D%E8%B4%B9%E4%B8%93%E5%8C%BA': self.finfLimitFree,
                          }
        self.spider = Spider('Duokan Special')
        if len(proxyHost) > 0:
            self.spider.set_proxy(proxyHost, proxyAuthUser, proxyAuthPswd)
        self.spider.add_callbacks(self.callbacks)
        self.spider.add_urls([Special.siteRoot,
                              # 'http://www.duokan.com/r/%E5%85%8D%E8%B4%B9%E4%B8%93%E5%8C%BA'
                              ])

    def findLinks(self, url, response):
        self.soup = BeautifulSoup(response, from_encoding='utf8')
        list_nodes = self.soup.findAll('div', attrs={'class': 'u-aimg'})
        if len(list_nodes) > 0:
            list_node = list_nodes[0]
            links = list_node.findAll('a')
            # limit free read
            link = links[0]
            link = [Special.siteRoot + link['href']]
            self.spider.add_urls(link)
            self.dispatch(Special.EVT_FIND_LINK, link[0])
        # limit free buy
        # link = links[2]
        # link = [Special.siteRoot + link['href']]
        # self.spider.add_urls(link)

    def finfLimitFree(self, url, response):
        self.soup = BeautifulSoup(response, from_encoding='utf8')
        list_nodes = self.soup.findAll('li', attrs={'class': 'u-bookitm1 j-bookitm'})
        if len(list_nodes) > 0:
            list_node = list_nodes[0]
            links = list_node.findAll('a')
            # limit free read
            link = links[0]
            link = [Special.siteRoot + link['href']]
            self.spider.add_urls(link)
            self.dispatch(Special.EVT_FIND_LINK, link[0])

    def findBooks(self, url, response):
        self.soup = BeautifulSoup(response, from_encoding='utf8')
        book_nodes = self.soup.findAll('li', attrs={'class': 'u-bookitm1 j-bookitm'})
        for item in book_nodes:
            id = item['data-id']
            if id:
                title = item.find('a', attrs={'class': 'title'}).string
                link = item.find('a', attrs={'class': 'title'})['href']
                author = item.find('div', attrs={'class': 'u-author'}).find('span').string
                self.titles[id] = title
                self.links[id] = Special.siteRoot + link
                self.authors[id] = author
                self.dispatch(Special.EVT_FIND_BOOK, id, self.titles[id], self.authors[id], self.links[id])
        return self.titles

    def findBook(self, url, response):
        self.soup = BeautifulSoup(response, from_encoding='utf8')
        # id
        # content = self.soup.find('meta', attrs={'name':'apple-itunes-app'})['content'].split('/')
        # id = content[len(content) - 1]
        # title
        # descNode = self.soup.findAll('div', attrs={'class':'desc'})
        # title = descNode[0].find('h3').string
        # author
        author = ''
        # author = descNode[0].find('td', attrs={'class':'author'}).find('a').string
        # link
        # link = self.soup.find('div', attrs={'class':'cover', 'id':'cover-img'}).find('a')['href']
        # link = DuokanSpecial.siteRoot + link
        # self.dispatch(DuokanSpecial.ON_FIND_BOOK, id, title, author, link)

        scriptNodes = self.soup.findAll('script', attrs={'type': 'text/javascript'})
        for node in scriptNodes:
            str = node.string
            if str:
                if str.find('window.dk_data') > 0:
                    start = str.index('=') + len('=')
                    end = str.index('window.dk_data.comments_url')
                    str = str[start:end]
                    # str = str.strip().lstrip()
                    # Rewrite the JavaScript object literal into Python dict
                    # syntax: quote the bare keys and mark string values as
                    # unicode literals so the snippet can be eval()'d below.
                    str = str.replace('book_id :', '\'book_id\' :')
                    str = str.replace('book :', '\'book\' :')
                    str = str.replace('sid :', '\'sid\' :')
                    str = str.replace('id :', '\'id\' :')
                    str = str.replace('title : ', '\'title\' : u')
                    str = str.replace('old_price :', '\'old_price\' :')
                    str = str.replace('price :', '\'price\' :')
                    str = str.replace('cover :', '\'cover\' :')
                    str = str.replace('url :', '\'url\' :')
                    str = str.replace('webreader :', '\'webreader\' :')
                    str = str.replace('limited_time :', '\'limited_time\' :')
                    str = str.replace('authors : ', '\'authors\' : u')
                    # print str
                    dk_data = eval(str)
                    id = dk_data['book']['id']
                    title = dk_data['book']['title']
                    author = dk_data['book']['authors']
                    link = Special.siteRoot + dk_data['book']['url']
                    self.dispatch(Special.EVT_FIND_BOOK, id, title, author, link)

    def start(self):
        self.spider.start()

    def stop(self):
        self.spider.stop()

    def getTitle(self):
        return self.titles

    def getLinks(self):
        return self.links

    def getAuthors(self):
        return self.authors
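For reference, a hypothetical driver for the Special class above. Spider and Callbacks are project classes that are not shown on this page; the proxy arguments and the assumption that start() launches the crawl (and that the getters are read after it finishes) are placeholders, not part of the original code:

# Hypothetical driver code; Special, Spider and Callbacks come from the
# surrounding project, and whether start() blocks until the crawl ends is
# an assumption.
special = Special()            # or Special('proxy.example.com:8080', 'user', 'secret')
special.start()
# ... once the crawl has finished ...
books = special.getTitle()     # {book_id: title}
links = special.getLinks()     # {book_id: absolute book URL}
authors = special.getAuthors() # {book_id: author}
special.stop()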
Example #4
class WsjImg:
    site_root = 'http://cn.wsj.com/'
    page_root = 'http://cn.wsj.com/gb/'
    img_root = 'http://cn.wsj.com/pictures/photo/'
    starts = ['http://cn.wsj.com/gb/pho.asp']
    # starts = ['http://cn.wsj.com/gb/20141230/PHO094555.asp']
    # callbacks = {'http://cn.wsj.com/gb/pho.asp':WsjImg.find_links, 'http://cn.wsj.com/gb/':WsjImg.parse_page, 'http://cn.wsj.com/pictures/photo/':WsjImg.save_img}

    # page url path
    # ['', 'gb', '20130528', 'PHO184538.asp']
    idx_page_date = 2
    idx_page_filename = 3
    # img url path
    # ['', 'pictures', 'photo', 'BJ20141226094555', '01.jpg']
    idx_img_dir = 3
    idx_img_filename = 4

    # persist
    DIR_BASE = 'base'
    DIR_ROOT = 'dat'
    DIR_IMG = 'img'

    def __init__(self, start='', end=''):
        self.TAG = WsjImg.__name__
        self.init_date(start, end)
        self.db = WsjPersist()

        self.callbacks = {
                'http://cn.wsj.com/gb/pho.asp': self.find_links, 
                'http://cn.wsj.com/gb/20': self.parse_page,
                'http://cn.wsj.com/pictures/photo/': self.save_img
        }
        self.spider = Spider('WsjImg')
        self.spider.set_proxy('proxy-amer.delphiauto.net:8080', 'rzfwch', '8ik,mju7')
        self.spider.add_callbacks(self.callbacks)
        self.spider.add_urls(self.starts)
        self.spider.start()

    def init_date(self, strStart='', strEnd=''):
        '''Initiate start/end date'''
        self.strStart = strStart
        self.strEnd = strEnd

    def find_links(self, url, response):
        '''Parse the photos news default page and find photos news page urls'''
        Log.i(self.TAG, 'find links in %s' % url)
        links = ImgPageLinks(response, self.strStart, self.strEnd)
        urls = links.getLinks(response)
        # urls = links.persistToDB(self.db)
        self.spider.add_urls(urls)

    def parse_page(self, url, response):
        '''Parse photos news page, find content and image urls, also with other photos news page urls.'''
        # find img page links
        self.find_links(url, response)
        # process image page.
        imgPage = ImgPage(url, response)
        imgPage.clear()
        imgPage.parseImgUrls()
        if len(imgPage.imgUrls.keys()) > 1:
            imgPage.save(os.path.join(WsjImg.DIR_ROOT, imgPage.filePath))

            with open(os.path.join(WsjImg.DIR_ROOT, imgPage.data['path']), 'w') as f:
                f.write(json.dumps(imgPage.data))

            imgPage.persistToDB(self.db)
            self.db.updateArt(url, imgPage.title, imgPage.summary)

            # save imgs of the page
            self.save_imgs(imgPage)

            # copy base files to here
            # os.system('cp -a %s/* %s/' % (WsjImg.dir_base, os.path.join(WsjImg.dir_root, page_date)))

            self.spider.fetch.copyall(WsjImg.DIR_BASE, os.path.join(WsjImg.DIR_ROOT, imgPage.pageDate))
        else:
            print 'no link found in %s' % url

    def save_img(self, url, response):
        print 'ignore %s' % url

    def save_imgs(self, imgPage):
        for url in imgPage.imgUrls.keys():
            dstfile = os.path.join(WsjImg.DIR_ROOT, imgPage.imgUrls[url]['path'])
            self.spider.download(url, dstfile)
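A short usage sketch for WsjImg. Judging from the commented sample URL (http://cn.wsj.com/gb/20141230/PHO094555.asp), the start and end arguments look like YYYYMMDD date strings, but that format and the range below are assumptions; note that the constructor itself calls spider.start(), so building the object is enough to run the crawl:

# Hypothetical invocation; the date-string format is an assumption.
if __name__ == '__main__':
    WsjImg('20141201', '20141231')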