Exemplo n.º 1
0
class DoubanId(Spider):
    '''
    Spider that issues one Douban search request per known media title.

    Titles are read from ``dbmgr.get_media_title()``; every search response
    is handed to ``self.parse_search``, which is not defined in this excerpt.
    '''
    name = 'douban_search'
    site_code = 'douban'
    # Evaluated once at class-definition time, so every instance of this
    # spider uses the same manager object.
    dbmgr = DbManager.instance()

    def __init__(self, *args, **kwargs):
        '''
        Resolve the site id and load the ids/titles the spider works from.

        All positional and keyword arguments are forwarded unchanged to
        ``Spider.__init__``.
        '''
        super(DoubanId, self).__init__(*args, **kwargs)
        self.site_id = self.dbmgr.get_site_id_by_code(self.site_code)
        # Alternative search endpoint, kept for reference:
        #   "https://movie.douban.com/subject_search?search_text=%s"
        self._search_api = "https://www.douban.com/search?cat=1002&q=%s"  # more accurate
        self._url_api = "https://movie.douban.com/subject/%s/"
        self.seen = set(self.dbmgr.get_douban_media_dou_id())
        self.titles = self.dbmgr.get_media_title()

    def start_requests(self):
        '''
        Yield one search ``Request`` for each entry in ``self.titles``.

        Any exception raised while building requests is logged with its
        traceback and ends the generator; it is not re-raised.
        '''
        try:
            for title in self.titles:
                # The return value is not used: the URL below is built from
                # the raw title. The call is kept in case process_str has
                # side effects.
                # NOTE(review): the processed title was probably meant to be
                # interpolated into the URL -- confirm against process_str.
                self.process_str(title)
                yield Request(url=self._search_api % title, callback=self.parse_search)
        except Exception:
            logging.log(logging.ERROR, traceback.format_exc())
Exemplo n.º 2
0
class DoubanList(Spider):
    '''
    '''
    name = 'douban_list'
    site_code = 'douban'
    mgr = DbManager.instance()
    max_number = 10000

    def __init__(self, *args, **kwargs):
        '''
        Initialise the base spider, look up the site id and define the
        Douban endpoint templates used by this spider.

        Every ``%s`` in a template is a placeholder filled in later with
        the ``%`` operator. All arguments are forwarded to the base class.
        '''
        super(DoubanList, self).__init__(*args, **kwargs)
        self.site_id = self.mgr.get_site_id_by_code(self.site_code)

        # --- movie.douban.com JSON endpoints ---
        # ``type`` is one of 'movie' or 'tv'.
        self.api_movie_tag = (
            'https://movie.douban.com/j/search_tags'
            '?type=%s'
        )
        self.api_movie_list = (
            'https://movie.douban.com/j/search_subjects'
            '?type=%s&tag=%s&sort=%s&page_limit=%s&page_start=%s'
        )
        # Same search_subjects endpoint with tag, sort order and start
        # offset fixed in the template.
        self.api_latest = (
            'https://movie.douban.com/j/search_subjects'
            '?type=%s&tag=热门&sort=time&page_limit=%s&page_start=0'
        )

        # --- www.douban.com tag endpoints ---
        self.api_tag_page = (
            'https://www.douban.com/tag/%s/'
            '?focus=movie'
        )
        self.api_tag_list = (
            'https://www.douban.com/j/tag/items'
            '?start=%s&limit=%s&topic_id=%s&topic_name=%s&mod=movie'
        )

    def start_requests(self):
        try:
            items = []
            self.load_member_variable()
            if self.max_number == 0:
                items.extend(self.enter_movie())
            else:
                items.extend(self.enter_latest())
            #items.extend(self.enter_tag())
        except Exception, e:
            logging.log(logging.ERROR, traceback.format_exc())
        finally:
Exemplo n.º 3
0
class ImdbChief(Spider):
    '''
    Spider that requests the IMDb title page for every stored IMDb id.

    Ids are read from ``dbmgr.get_douban_media_imdbs()``; each response is
    handed to ``self.parse_chief_imdb``, which is not defined in this
    excerpt.
    '''
    name = 'imdb_chief'
    site_code = 'douban'
    # Evaluated once at class-definition time and shared by all instances.
    dbmgr = DbManager.instance()

    def __init__(self, *args, **kwargs):
        '''
        Forward all arguments to ``Spider.__init__`` and load the set of
        IMDb ids to crawl.
        '''
        super(ImdbChief, self).__init__(*args, **kwargs)
        self._imdb_api = "http://www.imdb.com/title/%s/"
        # De-duplicated via set(); iteration order is therefore arbitrary.
        self.seen = set(self.dbmgr.get_douban_media_imdbs())

    def start_requests(self):
        '''
        Yield one ``Request`` per id in ``self.seen``.

        The id is also passed along in ``meta['imdb']`` so the callback can
        tell which id a response belongs to. Any exception raised while
        building requests is logged with its traceback and ends the
        generator; it is not re-raised.
        '''
        try:
            for imdb in self.seen:
                yield Request(url=self._imdb_api % imdb,
                              callback=self.parse_chief_imdb,
                              meta={'imdb': imdb})
        except Exception:
            logging.log(logging.ERROR, traceback.format_exc())
Exemplo n.º 4
0
class DoubanRecommend(Spider):
    '''
    Spider that requests the Douban subject page of every media id that
    the database knows about but the Mongo store does not yet list.

    Responses are handed to ``self.parse_recommend``, which is not defined
    in this excerpt.
    '''
    name = 'douban_recommend'
    # Both managers are evaluated once at class-definition time and shared
    # by all instances.
    db_mgr = DbManager.instance()
    mongo_mgr = MongoMgr.instance()

    def __init__(self, *args, **kwargs):
        '''
        Forward all arguments to ``Spider.__init__`` and compute the set of
        Douban ids still to be crawled.
        '''
        super(DoubanRecommend, self).__init__(*args, **kwargs)
        # Set difference: ids from get_douban_media_dou_id() minus ids
        # returned by get_recommend_dou_id().
        self.seen = set(self.db_mgr.get_douban_media_dou_id()) - set(
            self.mongo_mgr.get_recommend_dou_id())
        self._url_api = "https://movie.douban.com/subject/%s/"
        # Single-argument call form prints the same text on Python 2 and is
        # also valid Python 3 syntax.
        print(len(self.seen))

    def start_requests(self):
        '''
        Yield one ``Request`` per id in ``self.seen``.

        Any exception raised while building requests is logged with its
        traceback and ends the generator; it is not re-raised.
        '''
        try:
            for dou_id in self.seen:
                url = self._url_api % dou_id
                yield Request(url=url, callback=self.parse_recommend)
        except Exception:
            logging.log(logging.ERROR, traceback.format_exc())
Exemplo n.º 5
0
 def __init__(self, settings):
     '''
     Build the initial proxy pool from ``settings`` and then call
     ``self.load_proxy_web()``.

     ``settings`` must provide ``get(key)``; the value stored under
     ``'SD_PROXY_LIST'`` is kept as-is in ``self.sd_proxy_list`` and
     de-duplicated into ``self.proxies``.
     '''
     self.mgr = DbManager.instance()
     self.sd_proxy_list = settings.get('SD_PROXY_LIST')
     # set(None) raises TypeError, so a value of None (e.g. the key is not
     # configured -- assumes settings.get returns None in that case) gives
     # an empty pool instead of failing during construction.
     self.proxies = set(self.sd_proxy_list or ())
     self.load_proxy_web()
Exemplo n.º 6
0
class DoubanSpread(CrawlSpider):
    '''
    Crawl spider that starts from the Douban movie home page (plus the
    media URLs returned by the database manager) and follows links matched
    by ``rules``. Subject pages are parsed by ``parse_media``.
    '''
    name = 'douban_spread'
    site_code = 'douban'
    # Evaluated once at class-definition time and shared by all instances.
    mgr = DbManager.instance()
    allowed_domains = ['movie.douban.com', 'www.douban.com']

    # The first two rules (subject pages, with or without a ``?from=``
    # query) both extract an item via parse_media and keep following links.
    # Every other rule only follows links.
    # NOTE(review): in r'.*/typerank?.*' the '?' is a regex quantifier that
    # makes the 'k' optional; it does not match a literal '?'. Left as-is
    # because the pattern still matches '/typerank?...' URLs.
    rules = (
        Rule(LinkExtractor(allow=r'https://movie.douban.com/subject/\d+/$',
                           tags=('a', )),
             callback='parse_media',
             follow=True),
        Rule(LinkExtractor(
            allow=r'https://movie.douban.com/subject/\d+/\?from=.*',
            tags=('a', )),
             callback='parse_media',
             follow=True),
        Rule(LinkExtractor(allow=r'.*/doulist/.*', tags=('a', )), follow=True),
        Rule(LinkExtractor(allow=r'.*/celebrity/\d+/.*', tags=('a', )),
             follow=True),
        Rule(LinkExtractor(allow=r'.*/typerank?.*', tags=('a', )),
             follow=True),
        Rule(LinkExtractor(allow=r'.*/chart.*', tags=('a', )), follow=True),
        Rule(LinkExtractor(allow=r'https://movie.douban.com/tag.*',
                           tags=('a', )),
             follow=True),
        Rule(LinkExtractor(allow=r'https://movie.douban.com/tv.*',
                           tags=('a', )),
             follow=True),
        Rule(LinkExtractor(allow=r'https://movie.douban.com/explore.*',
                           tags=('a', )),
             follow=True),
        Rule(LinkExtractor(allow=r'.*/awards.*', tags=('a', )), follow=True),
        Rule(LinkExtractor(allow=r'.*/top250.*', tags=('a', )), follow=True),
        Rule(LinkExtractor(allow=r'.*/annual2015.*', tags=('a', )),
             follow=True),
        Rule(LinkExtractor(allow=r'.*/people/\w+/', tags=('a', )),
             follow=True),
        Rule(LinkExtractor(allow=r'.*/people/\w+/do.*', tags=('a', )),
             follow=True),
        Rule(LinkExtractor(allow=r'.*/people/\w+/wish.*', tags=('a', )),
             follow=True),
        Rule(LinkExtractor(allow=r'.*/people/\w+/collect.*', tags=('a', )),
             follow=True),
        Rule(LinkExtractor(allow=r'.*nowplaying.*', tags=('a', )),
             follow=True),
    )

    start_urls = ['https://movie.douban.com/']

    def __init__(self, *args, **kwargs):
        '''
        Forward all arguments to ``CrawlSpider.__init__``, resolve the site
        id and add the stored media URLs to this instance's start URLs.
        '''
        super(DoubanSpread, self).__init__(*args, **kwargs)
        self.site_id = self.mgr.get_site_id_by_code(self.site_code)
        # Build a new per-instance list instead of extending in place:
        # start_urls is a class attribute, so an in-place extend() would
        # grow the list shared by every instance each time one is created.
        self.start_urls = (list(self.start_urls) +
                           list(self.mgr.get_douban_media_urls()))
        # Debug override for a single subject page:
        #self.start_urls = ['https://movie.douban.com/subject/1295644/']

    def parse_media(self, response):
        '''
        Parse a subject page into a media item.

        Delegates to ``common_parse_media(response)``, stamps the result
        with ``self.site_id`` under the ``'site_id'`` key, prints it and
        returns it.

        Exceptions are logged with their traceback and not re-raised. On
        failure the method returns whatever had been parsed so far, which
        is ``None`` when ``common_parse_media`` itself raised.
        '''
        # Bound before the try block so the failure path never hits an
        # unbound local when common_parse_media() raises.
        mediaItem = None
        try:
            logging.log(logging.INFO, 'parse_media: %s' % response.request.url)
            mediaItem = common_parse_media(response)
            mediaItem['site_id'] = self.site_id
            # Single-argument call form prints the same text on Python 2
            # and is also valid Python 3 syntax.
            print(mediaItem)
        except Exception:
            logging.log(logging.ERROR, traceback.format_exc())
        return mediaItem