Пример #1
0
    def parse(self, response):
        """Parse a person-list page.

        On the first call (``max_page == -1``) read the pager edge label,
        compute the total page count, and queue every remaining list page;
        then collect the person ids/names visible on the current page.

        Returns a ``BangumiIdListItem`` wrapping the collected ids.
        """
        if self.max_page == -1:
            # The pager edge label looks like "( 1 / 123 )"; the captured
            # group is the total page count.  Raw strings avoid the invalid
            # escape-sequence warnings the original '\(' / '\d' produced.
            self.max_page = int(
                re.findall(
                    r'\( \d+ / (\d+) \)',
                    Selector(response=response).xpath(
                        '//span[@class="p_edge"]/text()').extract()[0],
                    re.S)[0])
            for page in range(1, self.max_page):
                self.start_urls.append('http://bangumi.tv/person?page=%s' %
                                       page)

            print(self.start_urls)
            print("get max page: %s" % self.max_page)

        person_id_list = list()
        for person_field in response.selector.xpath(
                '//div[@class="light_odd  clearit"]//h3//a'):
            person_id_list.append(
                BangumiIdItem(bangumi_id=get_field_value(
                    person_field.xpath('./@href').re(r'/person/(\d+)')),
                              bangumi_type="person",
                              bangumi_name=get_field_value(
                                  person_field.xpath('./text()').extract())))
        return BangumiIdListItem(bangumi_data=person_id_list)
Пример #2
0
    def parse_cv_work(self, response):
        """Parse one page of a voice actor's works list.

        Appends each work to ``person['works']`` and, while a next-page
        pager link ("››") exists, returns a follow-up ``Request``; once
        pagination is exhausted, returns the accumulated ``person`` item.
        """
        person = response.meta['person']
        root_selector = Selector(response)
        for work_field in root_selector.xpath('//*[@id="browserItemList"]/li'):
            cv_work = PersonCharacterVoiceWork()
            # Evaluate the <small> node once instead of three times.
            small_text = work_field.xpath('.//div/h3/small/text()')
            if len(small_text) == 0:
                # No <small> tag: the anchor text is the Japanese name.
                cv_work['japan_name'] = get_field_value(work_field.xpath('.//div/h3/a/text()').extract())
            else:
                extracted = small_text.extract()
                cv_work['japan_name'] = extracted[0] if len(extracted[0]) != 0 else None
                cv_work['chinese_name'] = get_field_value(work_field.xpath('.//div/h3/a/text()').extract())
            cv_work['bangumi_id'] = get_field_value(work_field.xpath('.//div/h3/a/@href').re(r'subject/(\d+)'))
            cv_work['job'] = work_field.xpath('.//span[@class="badge_job"]/text()').extract()
            person['works'].append(cv_work)
        try:
            # "››" as the last pager anchor means there is a next page.
            if root_selector.xpath('//*[@id="columnCrtB"]/div[3]/div/div/a/text()').extract()[-1] == "››":
                next_url = root_selector.xpath('//*[@id="columnCrtB"]/div[3]/div/div/a/@href').extract()[-1]
                request = scrapy.Request('http://bangumi.tv/person/%s/works%s' % (person['bangumi_id'], next_url),
                                         callback=self.parse_cv_work)
                request.meta['person'] = person
                return request
        except IndexError:
            # No pager on the page at all: nothing more to crawl.
            # (Narrowed from a bare except; the unreachable trailing `pass`
            # after the original `return person` was removed.)
            return person

        return person
Пример #3
0
 def parse_music_artist(self, response):
     """Parse the artist list of a music subject.

     Fills ``music['artist']`` (the ``music`` item travels in
     ``response.meta``) with one ``MusicArtist`` per listed person and
     returns the completed ``music`` item.
     """
     music = response.meta['music']
     root_selector = Selector(response)
     music['artist'] = list()
     for artist_field in root_selector.xpath('//*[@id="columnInSubjectA"]/div'):
         artist = MusicArtist()
         # The heading text carries a "  / " separator before the name.
         artist['japan_name'] = artist_field.xpath('./div/h2/a/text()').extract()[0].replace('  / ', '')
         artist['bangumi_id'] = get_field_value(artist_field.xpath('./div/h2/a/@href').re(r'/person/(\d+)'))
         artist['chinese_name'] = get_field_value(artist_field.xpath('./div/h2/a/span/text()').extract())
         artist['job'] = get_field_value(artist_field.re('<span class="badge_job">(.*?)</span>'))
         music['artist'].append(artist)
     return music
Пример #4
0
    def parse(self, response):
        """Parse a music subject page into a ``Music`` item.

        Extracts id, name, summary, cover and the infobox rows, then
        returns a ``Request`` for the track listing (``/ep``) handled by
        ``parse_music_tract`` with the item attached to ``meta``.
        """
        logging.info("loading")
        music = Music()
        root_selector = Selector(response)
        music['bangumi_id'] = get_field_value(
            root_selector.xpath('//*[@id="headerSubject"]/h1/a/@href').re(r'/subject/(\d+)'))
        music['name'] = get_field_value(root_selector.xpath('//*[@id="headerSubject"]/h1/a/text()').extract())
        music['detail'] = root_selector.xpath('//*[@id="subject_summary"]/text()')
        if len(music['detail']) == 0:
            music['detail'] = None
        else:
            music['detail'] = "".join(music['detail'].extract())
        try:
            music['cover_url'] = 'http:%s' % root_selector.xpath('//*[@id="bangumiInfo"]/div/div/a/@href').extract()[0]
        except IndexError:
            # No cover image on the page (narrowed from a bare except).
            music['cover_url'] = None
        music['cover_prefix'] = 'music'
        music_info_selector = Selector(response=response).xpath('//*[@id="infobox"]/li')
        music['info'] = dict()
        for item in music_info_selector:
            # Row title; the slice drops the trailing separator characters.
            music_info_title = item.xpath('./span/text()').extract()[0][:-2]
            if music_info_title == '发售日期':
                try:
                    music['publish_date'] = datetime.datetime.strptime(
                        get_field_value(item.xpath('./text()').extract()),
                        "%Y-%m-%d")
                except (TypeError, ValueError):
                    # Missing value (None) or non-ISO date text: leave
                    # publish_date unset (narrowed from a bare except).
                    pass

            music['info'][music_info_title] = list()
            li_content = (item.extract())
            # 处理获取信息 (generic info-row handling)
            if li_content.find('<a') == -1:
                # 去除多余信息: strip the tip span and any remaining tags.
                li_content = re.sub('<span class="tip">(.*?)</span>', "", li_content)
                li_content = re.sub("<.*?>", "", li_content)
                if li_content.find("、") != -1:
                    # "、"-separated multi-value row.
                    value_text_set = li_content.split("、")
                    for i in value_text_set:
                        music['info'][music_info_title].append(i)
                else:
                    music['info'][music_info_title].append(li_content)
            else:
                # Linked values: take each anchor's text.
                for value in item.xpath('.//a/text()').extract():
                    music['info'][music_info_title].append(value)
        tag_field = response.xpath('//*[@class="subject_tag_section"]//span/text()')
        music['tag'] = [tag.extract() for tag in tag_field]

        request = Request('http://bangumi.tv/subject/%s/ep' % music['bangumi_id'], callback=self.parse_music_tract)
        request.meta['music'] = music
        return request
Пример #5
0
    def parse(self, response):
        """Parse a book subject page into a ``BookItem``.

        Known infobox rows (author, press, release date, ISBN, page count)
        go to dedicated fields; every other row is kept in ``book['info']``.
        Returns the completed item.
        """
        book = BookItem()
        root_selector = Selector(response)
        book['book_type'] = get_field_value(response.xpath('//*[@id="headerSubject"]/h1/small/text()').extract())
        book['bangumi_id'] = get_field_value(
            root_selector.xpath('//*[@id="headerSubject"]/h1/a/@href').re(r'/subject/(\d+)'))
        book['name'] = get_field_value(root_selector.xpath('//*[@id="headerSubject"]/h1/a/text()').extract())
        book['detail'] = ''.join(root_selector.xpath('//*[@id="subject_summary"]/text()').extract()) if len(
            root_selector.xpath('//*[@id="subject_summary"]/text()').extract()) != 0 else None
        book['cover_url'] = 'http:%s' % get_field_value(
            root_selector.xpath('//*[@id="bangumiInfo"]/div/div/a/@href').extract())
        book['cover_prefix'] = 'book'
        tag_field = response.xpath('//*[@class="subject_tag_section"]//span/text()')
        book['tag'] = [tag.extract() for tag in tag_field]
        book_info_selector = Selector(response=response).xpath('//*[@id="infobox"]/li')
        book['info'] = dict()
        for item in book_info_selector:
            # Row title; the slice drops the trailing separator characters.
            book_info_title = item.xpath('./span/text()').extract()[0][:-2]
            if book_info_title == '作者':
                book['author'] = get_field_value(item.xpath('./text()').extract())
                continue
            elif book_info_title == '出版社':
                book['press'] = get_field_value(item.xpath('./text()').extract())
                continue
            elif book_info_title == '发售日':
                try:
                    book['publish_date'] = datetime.datetime.strptime(get_field_value(item.xpath('./text()').extract()),
                                                                      "%Y-%m-%d")
                except (TypeError, ValueError):
                    # Missing value (None) or non-ISO date text: leave
                    # publish_date unset (narrowed from a bare except).
                    pass
                continue
            elif book_info_title == 'ISBN':
                book['ISBN'] = get_field_value(item.xpath('./text()').extract())
                continue
            elif book_info_title == '页数':
                book['page'] = int(get_field_value(item.xpath('./text()').re(r'(\d+)')))
                continue

            book['info'][book_info_title] = list()
            li_content = (item.extract())
            # 处理获取信息 (generic info-row handling)
            if li_content.find('<a') == -1:
                # 去除多余信息: strip the tip span and any remaining tags.
                li_content = re.sub('<span class="tip">(.*?)</span>', "", li_content)
                li_content = re.sub("<.*?>", "", li_content)
                if li_content.find("、") != -1:
                    # "、"-separated multi-value row.
                    value_text_set = li_content.split("、")
                    for i in value_text_set:
                        book['info'][book_info_title].append(i)
                    else:
                        pass
                else:
                    book['info'][book_info_title].append(li_content)
            else:
                # Linked values: take each anchor's text.
                for value in item.xpath('.//a/text()').extract():
                    book['info'][book_info_title].append(value)
        return book
Пример #6
0
    def parse_game_cast(self, response):
        """Parse the staff/cast list page of a game.

        Fills ``game['cast']`` (the ``game`` item travels in
        ``response.meta``) and returns the completed item.
        """
        game = response.meta['game']
        cast_set_field = Selector(response).xpath('//*[@id="columnInSubjectA"]/div[@class = "light_odd clearit"]')
        game['cast'] = list()
        for cast_item_field in cast_set_field:
            cast = GameCast()
            cast['bangumi_id'] = get_field_value(cast_item_field.xpath('.//h2/a').re(r'/person/(\d+)'))
            # Evaluate the name once instead of running the same xpath twice.
            japan_name = get_field_value(cast_item_field.xpath('.//h2/a/text()').extract())
            cast['japan_name'] = japan_name.replace(" / ", "") if japan_name is not None else None
            cast['chinese_name'] = get_field_value(cast_item_field.xpath('.//h2/a/span/text()').extract())

            # extract() already yields the list the append-loop was building.
            cast['job'] = cast_item_field.xpath(
                './/div[@class = "prsn_info"]//span[@class = "badge_job"]/text()').extract()
            game['cast'].append(cast)
        return game
Пример #7
0
    def parse_game_character(self, response):
        """Parse the character list page of a game, including each
        character's voice actors, then return a ``Request`` for the staff
        page handled by ``parse_game_cast``.
        """
        game = response.meta['game']
        game['character'] = list()
        character_field = Selector(response).xpath('//*[@id="columnInSubjectA"]')
        for character in character_field.xpath('.//div[@class="clearit"]'):
            game_character = GameCharacter()
            game_character['cv'] = list()
            game_character['bangumi_id'] = get_field_value(character.re(r'<a href="/character/(\d+)" class="l">'))

            game_character['japan_name'] = get_field_value(character.xpath('./h2/a/text()').extract())
            # The span text carries a 3-character separator prefix before the
            # Chinese name; guard against the span being absent, in which
            # case get_field_value returns None and the original [3:] slice
            # raised TypeError.
            chinese_name = get_field_value(character.xpath('./h2/span/text()').extract())
            game_character['chinese_name'] = chinese_name[3:] if chinese_name is not None else None
            game_character['job'] = get_field_value(character.re('<span class="badge_job">(.*?)</span>'))

            # cv information
            cv_list_field = character.xpath('.//div[@class = "actorBadge clearit"]/p')
            for cv_list in cv_list_field:
                cv = GameCharacterVoice()
                cv['bangumi_id'] = get_field_value(cv_list.re(r'<a href="/person/(\d+)" class="l">'))
                cv['japan_name'] = get_field_value(cv_list.xpath('./a/text()').extract())

                cv['chinese_name'] = get_field_value(cv_list.xpath('./small/text()').extract())
                game_character['cv'].append(cv)
            game['character'].append(game_character)
        request = Request(url='http://bangumi.tv/subject/%s/persons' % game['bangumi_id'],
                          callback=self.parse_game_cast)
        request.meta['game'] = game
        return request
Пример #8
0
 def parse_cv_character(self, response):
     """Parse the characters voiced by a person.

     Fills ``person['character']`` (each character with its list of works)
     and returns a ``Request`` for the person's works list handled by
     ``parse_cv_work``.
     """
     person = response.meta['person']
     person['character'] = list()
     root_selector = Selector(response)
     for character_li in root_selector.xpath('//*[@id="columnCrtB"]/div[2]/ul/li'):
         character = PersonCharacter()
         character['japan_name'] = get_field_value(
             character_li.xpath('.//div[@class="ll innerLeftItem"]/a/@title').extract())
         character['bangumi_id'] = get_field_value(
             character_li.xpath('.//div[@class="ll innerLeftItem"]/a/@href').re(r'/character/(\d+)'))
         character['chinese_name'] = get_field_value(
             character_li.xpath('.//div[@class="ll innerLeftItem"]//p/text()').extract())
         character['work'] = list()
         for work in character_li.xpath('.//ul/li'):
             character_work = PersonCharacterVoiceWork()
             character_work['bangumi_id'] = get_field_value(work.xpath('.//div/h3/a/@href').re(r'/subject/(\d+)'))
             character_work['japan_name'] = get_field_value(work.xpath('.//div/h3/a/text()').extract())
             character_work['chinese_name'] = get_field_value(work.xpath('.//div//small/text()').extract())
             # Bug fix: the job belongs to the per-work item (as it does in
             # parse_cv_work); the original assigned character['job'] here,
             # overwriting it on every work iteration.
             character_work['job'] = get_field_value(work.xpath('.//div/span/text()').extract())
             character['work'].append(character_work)
         person['character'].append(character)
     request = scrapy.Request('http://bangumi.tv/person/%s/works?sort=date&page=1' % person['bangumi_id'],
                              callback=self.parse_cv_work)
     request.meta['person'] = person
     return request
Пример #9
0
 def parse_music_tract(self, response):
     """Parse the track listing (``/ep``) of a music subject.

     Tracks are grouped under the preceding ``cat`` header rows (e.g.
     disc names) into ``music['track']``; returns a ``Request`` for the
     artist list handled by ``parse_music_artist``.
     """
     music = response.meta['music']
     root_selector = Selector(response)
     cat_cur = ""
     music['track'] = OrderedDict()
     for episode_field in root_selector.xpath(
             '//*[@id="columnInSubjectA"]/div/ul/li'):
         episode_type = get_field_value(
             episode_field.xpath('./@class').extract())
         if episode_type == 'cat':
             # A category header row starts a new track group.
             cat_cur = get_field_value(
                 episode_field.xpath('./text()').extract())
             music['track'][cat_cur] = list()
         else:
             track = MusicTrack()
             track['name'] = get_field_value(
                 episode_field.xpath('./h6/a/text()').extract())
             track['bangumi_id'] = get_field_value(
                 episode_field.xpath('./h6/a/@href').re(r'/ep/(\d+)'))
             track['number'] = get_field_value(
                 episode_field.xpath('./h6/a/text()').re(r'^(\d+)'))
             track['artist'] = get_field_value(
                 episode_field.xpath('./h6/a/text()').re(r'\((.*?)\)$'))
             # Strip the leading "NN." index and trailing "(artist)" from
             # the displayed name.
             track['name'] = track['name'].replace(
                 '%s.' % track['number'],
                 '').replace('(%s)' % track['artist'], '')
             # setdefault guards against a listing whose first row is not a
             # category header (the original raised KeyError on cat_cur "").
             music['track'].setdefault(cat_cur, list()).append(track)
     request = Request('http://bangumi.tv/subject/%s/persons' %
                       music['bangumi_id'],
                       callback=self.parse_music_artist)
     request.meta['music'] = music
     return request
Пример #10
0
    def parse(self, response):
        """Parse a character-list page.

        On the first call (``max_page == -1``) read the pager edge label,
        compute the total page count, and queue every remaining list page;
        then collect the character ids/names visible on the current page.

        Returns a ``BangumiIdListItem`` wrapping the collected ids.
        """
        if self.max_page == -1:
            # The pager edge label looks like "( 1 / 123 )"; the captured
            # group is the total page count.  Raw strings avoid the invalid
            # escape-sequence warnings the original '\(' / '\d' produced.
            self.max_page = int(
                re.findall(
                    r'\( \d+ / (\d+) \)',
                    Selector(response=response).xpath(
                        '//span[@class="p_edge"]/text()').extract()[0],
                    re.S)[0])
            for page in range(1, self.max_page):
                self.start_urls.append(
                    'http://bangumi.tv/character?orderby=collects&page=%s' %
                    page)

        character_id_list = list()
        for character_field in response.selector.xpath(
                '//div[@class="light_odd  clearit"]//h3//a'):
            character_id_list.append(
                BangumiIdItem(bangumi_id=get_field_value(
                    character_field.xpath('./@href').re(r'/character/(\d+)')),
                              bangumi_type="character",
                              bangumi_name=get_field_value(
                                  character_field.xpath(
                                      './text()').extract())))
        return BangumiIdListItem(bangumi_data=character_id_list)
Пример #11
0
 def parse(self, response):
     """Parse a game subject page into a ``Game`` item.

     Known infobox rows (Chinese name, release date, platform) go to
     dedicated fields; every other row is kept in ``game['info']``.
     Returns a ``Request`` for the character list handled by
     ``parse_game_character``.
     """
     game = Game()
     root_selector = Selector(response)
     game['bangumi_id'] = get_field_value(
         root_selector.xpath('//*[@id="headerSubject"]/h1/a/@href').re(r'/subject/(\d+)'))
     game['name'] = get_field_value(root_selector.xpath('//*[@id="headerSubject"]/h1/a/text()').extract())
     game['detail'] = ''.join(
         get_field_value(root_selector.xpath('//*[@id="subject_summary"]/text()').extract())) if len(
         root_selector.xpath('//*[@id="subject_summary"]/text()').extract()) != 0 else None
     try:
         game['cover_url'] = 'http:%s' % root_selector.xpath('//*[@id="bangumiInfo"]/div/div/a/@href').extract()[0]
     except IndexError:
         # No cover image on the page (narrowed from a bare except).
         game['cover_url'] = None
     game['cover_prefix'] = 'game'
     game_info_selector = Selector(response=response).xpath('//*[@id="infobox"]/li')
     game['info'] = dict()
     game['platform'] = list()
     for item in game_info_selector:
         # Row title; the slice drops the trailing separator characters.
         game_info_title = item.xpath('./span/text()').extract()[0][:-2]
         if game_info_title == '中文名':
             try:
                 game['chinese_name'] = get_field_value(item.xpath('./text()').extract())
                 continue
             except IndexError:
                 pass
         if game_info_title == '发行日期':
             # Release dates appear either as "1999年9月9日" or "1999-09-09".
             try:
                 game['release_date'] = datetime.datetime.strptime(item.xpath('./text()').extract()[0], "%Y年%m月%d日")
                 continue
             except (IndexError, ValueError):
                 try:
                     game['release_date'] = datetime.datetime.strptime(item.xpath('./text()').extract()[0],
                                                                       "%Y-%m-%d")
                     continue
                 except (IndexError, ValueError):
                     pass
         if game_info_title == '平台':
             game['platform'].append(get_field_value(item.xpath('./text()').extract()))
             continue
         game['info'][game_info_title] = list()
         li_content = (item.extract())
         # 处理获取信息 (generic info-row handling)
         if li_content.find('<a') == -1:
             # 去除多余信息: strip the tip span and any remaining tags.
             li_content = re.sub('<span class="tip">(.*?)</span>', "", li_content)
             li_content = re.sub("<.*?>", "", li_content)
             if li_content.find("、") != -1:
                 # "、"-separated multi-value row.
                 value_text_set = li_content.split("、")
                 for i in value_text_set:
                     game['info'][game_info_title].append(i)
             else:
                 game['info'][game_info_title].append(li_content)
         else:
             # Linked values: take each anchor's text.
             for value in item.xpath('.//a/text()').extract():
                 game['info'][game_info_title].append(value)
     request = Request(url='http://bangumi.tv/subject/%s/characters' % game['bangumi_id'],
                       callback=self.parse_game_character)
     request.meta['game'] = game
     # Use .get(): scrapy Items raise KeyError for fields that were never
     # set, which crashed these debug prints when the infobox rows were
     # absent or unparseable.
     print(game.get('chinese_name'))
     print(game.get('release_date', "No date"))
     print(game['platform'])
     return request
Пример #12
0
    def parse(self, response):
        """Parse a person page into a ``Person`` item.

        Known infobox rows (sex, aliases, birthday, blood type, debut,
        height) go to dedicated fields; every other row is kept in
        ``person['info']``.  Returns a ``Request`` for either the voiced
        characters page (voice actors) or the works page (everyone else).
        """
        person = Person(works=[], character=[])
        root_selector = Selector(response)
        person['bangumi_id'] = get_field_value(
            root_selector.xpath('//*[@id="headerSubject"]/h1/a/@href').re(r'/person/(\d+)'))
        person['japan_name'] = get_field_value(root_selector.xpath('//*[@id="headerSubject"]/h1/a/text()').extract())
        person['chinese_name'] = get_field_value(
            root_selector.xpath('//*[@id="headerSubject"]/h1/small/text()').extract())
        person['job'] = get_field_value(
            root_selector.xpath('//*[@id="columnCrtB"]/div[1]/h2/text()').re(r' {1}(\S{1,}?) {1}'))
        person['detail'] = ''.join(root_selector.xpath('//*[@id="columnCrtB"]/div[2]/text()').extract()) if len(
            root_selector.xpath('//*[@id="columnCrtB"]/div[2]/text()').extract()) != 0 else None

        # Prefix with "http:%s" as every other cover_url in this spider
        # does: the extracted href is protocol-relative ("//..."), so the
        # original "http://%s" produced a malformed "http:////..." URL.
        person['cover_url'] = 'http:%s' % root_selector.xpath('//*[@id="columnCrtA"]/div[1]/div/a/@href').extract()[
            0] if len(root_selector.xpath('//*[@id="columnCrtA"]/div[1]/div/a/@href').extract()) != 0 else ''
        print(person['cover_url'])
        person['cover_prefix'] = 'person'
        person['info'] = dict()
        person['other_name'] = list()
        for info_item in root_selector.xpath('//*[@id="infobox"]/li'):

            # First text node is the row title (trailing separator sliced
            # off), second is the value.
            name = info_item.xpath('.//text()').extract()[0][:-2]
            value = info_item.xpath('.//text()').extract()[1]

            if name == '性别':
                person['sex'] = value
                continue

            if name == '别名':
                person['other_name'].append(value)
                continue
            if name == '生日':
                # Birthdays appear as "1999年9月9日" or "1999-09-09".
                try:
                    person['birthday'] = datetime.datetime.strptime(value, "%Y年%m月%d日")
                    continue
                except ValueError:
                    try:
                        person['birthday'] = datetime.datetime.strptime(value, "%Y-%m-%d")
                        continue
                    except ValueError:
                        pass
            if name == '血型':
                person['blood'] = value
                continue
            if name == '出道时间':
                try:
                    person['debut'] = datetime.datetime.strptime(value, "%Y")
                    continue
                except ValueError:
                    pass
            if name == '身高':
                try:
                    # e.g. "170cm" -> 170.0 (drop the two-character unit).
                    person['height'] = float(value[:-2])
                    continue
                except ValueError:
                    pass

            if name in person['info']:
                person['info'][name].append(value)
            else:
                person['info'][name] = [value]

        if root_selector.xpath('//*[@id="headerSubject"]/div/ul/li[2]/a/text()').extract()[0] == "角色":
            # Voice actor: collect the voiced characters first; that
            # callback chains into parse_cv_work itself.
            request = scrapy.Request('http://bangumi.tv/person/%s/works/voice' % person['bangumi_id'],
                                     callback=self.parse_cv_character)
            request.meta['person'] = person
            return request
        else:
            request = scrapy.Request(
                'http://bangumi.tv/person/%s/works?sort=date&page=1' % person['bangumi_id'],
                callback=self.parse_cv_work)
            request.meta['person'] = person
            return request
Пример #13
0
    def parse(self, response):
        """Parse a character page into a ``Character`` item.

        Known infobox rows (aliases, BWH, birthday, sex, height) go to
        dedicated fields; everything else lands in ``character['info']``.
        Also collects the works the character appears in (``play``).
        Returns the completed item.
        """
        character = Character()
        root_selector = Selector(response)

        character['chinese_name'] = get_field_value(
            root_selector.xpath(
                '//*[@id="headerSubject"]/h1/small/text()').extract())
        character['japan_name'] = get_field_value(
            root_selector.xpath(
                '//*[@id="headerSubject"]/h1/a/text()').extract())
        character['bangumi_id'] = get_field_value(
            root_selector.xpath('//*[@id="headerSubject"]/h1/a/@href').re(
                r'/character/(\d+)'))

        # NOTE(review): if the cover anchor is missing, get_field_value
        # presumably returns None and this becomes "http:None" — verify
        # against get_field_value's contract.
        character['cover_url'] = 'http:%s' % get_field_value(
            root_selector.xpath(
                '//*[@id="columnCrtA"]/div[1]/div/a/@href').extract())
        character['cover_prefix'] = 'character'
        info_dict = dict()
        character['other_name'] = list()
        for info_item in root_selector.xpath('//*[@id="infobox"]/li'):

            # First text node is the row title (trailing separator sliced
            # off), second is the value.
            name = info_item.xpath('.//text()').extract()[0][:-2]
            value = info_item.xpath('.//text()').extract()[1]
            if name == '别名':
                character['other_name'].append(value)
                continue
            if name == 'BWH':
                character['BWH'] = value
                continue
            if name == '生日':
                # Birthdays appear as "9月9日" or "09-09".
                try:
                    character['birthday'] = datetime.datetime.strptime(
                        value, "%m月%d日")
                    continue
                except ValueError:
                    try:
                        character['birthday'] = datetime.datetime.strptime(
                            value, "%m-%d")
                        continue
                    except ValueError:
                        pass
            if name == '性别':
                character['sex'] = value
                continue
            if name == '身高':
                try:
                    # e.g. "170cm" -> 170.0 (drop the two-character unit).
                    character['height'] = float(value[:-2])
                    continue
                except ValueError:
                    pass
            if name in info_dict:
                info_dict[name].append(value)
            else:
                info_dict[name] = [value]

        character['info'] = info_dict

        detail = root_selector.xpath(
            '//*[@id="columnCrtB"]/div[2]/text()').extract()
        if len(detail) == 0:
            detail = ""
        else:
            detail = "".join(detail)

        character['detail'] = detail
        play_list = list()
        for play_field in root_selector.xpath('//*[@id="columnCrtB"]/ul/li'):
            character_play = CharacterPlay()
            character_play['bangumi_id'] = get_field_value(
                play_field.xpath('.//h3/a/@href').re(r'subject/(\d+)'))
            character_play['japan_name'] = get_field_value(
                play_field.xpath('.//h3/a/text()').extract())
            character_play['chinese_name'] = get_field_value(
                play_field.xpath('./div/div/small/text()').extract())
            character_play['job'] = get_field_value(
                play_field.xpath('./div/div/span/text()').extract())
            character_play['character_voice_bangumi_id'] = get_field_value(
                play_field.xpath('./ul/li/div/h3/a/@href').re(r'/person/(\d+)'))
            character_play['character_voice_name'] = get_field_value(
                play_field.xpath('./ul/li/div/h3/a/text()').extract())
            play_list.append(character_play)
        character['play'] = play_list
        return character