Example #1
 def get_captcha(self, path=None):
     if 'sid' not in requests.utils.dict_from_cookiejar(self.cj):
         utils.get_page_content(LOGIN_CAPTCHA_URL.format(random.random()),
                                headers = {'Referer':'https://passport.bilibili.com/login'})
     result = utils.get_page_content(LOGIN_CAPTCHA_URL.format(random.random()),
                                     headers = {'Referer':'https://passport.bilibili.com/login'})
     if path is None:
         path = tempfile.gettempdir() + '/captcha.jpg'
     with open(path, 'wb') as f:
         f.write(result)
     return path
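
All of these snippets lean on a project-local utils.get_page_content helper that the listing never shows. A minimal sketch of what such a helper might look like, assuming the requests library; the real helper also threads the shared cookie jar (self.cj in the class-based examples) through each call, which is omitted here:

    import requests

    def get_page_content(url, data=None, headers=None):
        # POST when form data is supplied, otherwise GET; return the raw body
        # so callers can either write it to disk or json.loads() it.
        if data is not None:
            response = requests.post(url, data=data, headers=headers)
        else:
            response = requests.get(url, headers=headers)
        return response.content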
Example #2
    def get_category_from_web_page(self):
        category_dict = {'0': {'title': u'全部', 'url': HOME_URL, 'subs': []}}
        node = category_dict['0']
        url = node['url']
        result = BeautifulSoup(utils.get_page_content(url),
                               "html.parser").findAll('li', {'class': 'm-i'})
        for item in result:
            if len(item['class']) != 1:
                continue
            tid = item['data-tid']
            title = item.em.contents[0]
            url = 'http:' + item.a['href']
            category_dict[tid] = {'title': title, 'url': url, 'subs': []}
            node['subs'].append(tid)

        # Make sure the TV-series ('11') and movie ('23') categories are present
        if '11' not in category_dict['0']['subs']:
            category_dict['0']['subs'].append('11')
        if '23' not in category_dict['0']['subs']:
            category_dict['0']['subs'].append('23')
        category_dict['11'] = {
            'title': u'电视剧',
            'url': 'http://bangumi.bilibili.com/tv/',
            'subs': []
        }
        category_dict['23'] = {
            'title': u'电影',
            'url': 'http://bangumi.bilibili.com/movie/',
            'subs': []
        }

        for sub in category_dict['0']['subs']:
            node = category_dict[sub]
            url = node['url']
            result = BeautifulSoup(utils.get_page_content(url),
                                   "html.parser").select('ul.n_num li')
            for item in result[1:]:
                if not item.has_attr('tid'):
                    continue
                if item.a is None:
                    continue
                if item.has_attr('class'):
                    continue
                tid = item['tid']
                title = item.a.contents[0]
                if item.a['href'][:2] == '//':
                    url = 'http:' + item.a['href']
                else:
                    url = HOME_URL + item.a['href']
                category_dict[tid] = {'title': title, 'url': url, 'subs': []}
                node['subs'].append(tid)
        return category_dict
Example #3
 def get_captcha(self, path=None):
     if 'sid' not in requests.utils.dict_from_cookiejar(self.cj):
         utils.get_page_content(
             LOGIN_CAPTCHA_URL.format(random.random()),
             headers={'Referer': 'https://passport.bilibili.com/login'})
     result = utils.get_page_content(
         LOGIN_CAPTCHA_URL.format(random.random()),
         headers={'Referer': 'https://passport.bilibili.com/login'})
     if path is None:
         path = tempfile.gettempdir() + '/captcha.jpg'
     with open(path, 'wb') as f:
         f.write(result)
     return path
Example #4
 def get_captcha(self, path=None):
     utils.get_page_content('https://passport.bilibili.com/login')
     result = utils.get_page_content(
         LOGIN_CAPTCHA_URL,
         headers={
             'Referer':
             'https://passport.bilibili.com/ajax/miniLogin/minilogin'
         })
     if path is None:
         path = tempfile.gettempdir() + '/captcha.jpg'
     with open(path, 'wb') as f:
         f.write(result)
     return path
Example #5
 def _parse_urls(self, page_content, need_subtitle=True):
     self._print_info('Parsing page')
     url_params = self.URL_PARAMS.findall(page_content)
     interface_full_url = ''
      # The first regex matched successfully
     if url_params and len(url_params) == 1 and url_params[0]:
         interface_full_url = self.INTERFACE_URL.format(str(url_params[0]))
      # If it did not match, fall back to the second regex
     if not url_params:
         self._print_info('Parsing page by another regex')
         url_params = self.URL_PARAMS2.findall(page_content)
         if url_params and len(url_params) == 1 and url_params[0]:
             interface_full_url = self.INTERFACE_URL.format(str(url_params[0]))
     if interface_full_url:
         self._print_info('Interface url: ' + interface_full_url)
          # Parse the RSS page
         self._print_info('Getting video address by interface page')
         content = utils.get_page_content(interface_full_url)
         self._print_info('Interface page length: ' + str(len(content)))
         doc = minidom.parseString(content)
         parts = doc.getElementsByTagName('durl')
         self._print_info('Video parts found: ' + str(len(parts)))
         result = []
          # Collect all video URLs
         for part in parts:
             urls = part.getElementsByTagName('url')
             if len(urls) > 0:
                 result.append(urls[0].firstChild.nodeValue)
         if need_subtitle:
             return (result, self._parse_subtitle(url_params[0]))
         else:
             return (result, '')
     else:
         self._print_info('Interface url not found!')
     return ([], '')
Example #6
 def _parse_urls(self, page_content):
     url_params = self.URL_PARAMS.findall(page_content)
     interface_full_url = ''
      # The first regex matched successfully
     if url_params and len(url_params) == 1 and url_params[0]:
         interface_full_url = self.INTERFACE_URL.format(str(url_params[0]))
      # If it did not match, fall back to the second regex
     if not url_params:
         url_params = self.URL_PARAMS2.findall(page_content)
         if url_params and len(url_params) == 1 and url_params[0]:
             interface_full_url = self.INTERFACE_URL.format(str(url_params[0]))
     if interface_full_url:
          # Parse the RSS page
         content = utils.get_page_content(interface_full_url)
         doc = minidom.parseString(content)
         parts = doc.getElementsByTagName('durl')
         result = []
          # Collect all video URLs
         for part in parts:
             urls = part.getElementsByTagName('url')
             if len(urls) > 0:
                 result.append(urls[0].firstChild.nodeValue)
         return (result, self._parse_subtitle(url_params[0]))
      print(interface_full_url)
     return ([], '')
Example #7
    def get_next_page(self, list, bangumi, page):
        p_url = 'http://www.bilibili.com/sppage/bangumi-' + str(bangumi) + '-' + str(page) + '.html'
        html = utils.get_page_content(p_url)

        html = html.replace('\r', '')
        html = html.replace('\n', '')
        checknext = re.compile('<div class="no_more">(.+?)</div>').findall(html)
        if len(checknext):
            if checknext[0] == "没有更多信息":  # "no more information"
                return list

        else:
            thumbnail = ""
            cover = re.compile('<div class="cover"><img src="(.+?)"></div>').findall(html)
            if len(cover):
                thumbnail = cover[0]

            series = re.compile('<a class="t" href="/video/(.+?)" target="_blank" title="(.+?)">(.+?)</a>').findall(html)

            if len(series):
                for s in series:
                    list.append({
                        'title': s[1].strip(),
                        'link': s[0].strip(),
                        'type': 'bangumi',
                        'page': page,
                        'thumbnail':thumbnail,
                        'published': s[0].strip()})

            return self.get_next_page(list,bangumi,page+1)
Example #8
 def get_video_urls(self, url, need_subtitle=True):
     self._print_info('Getting video address')
     page_full_url = self.BASE_URL + url
     self._print_info('Page url: ' + page_full_url)
     page_content = utils.get_page_content(page_full_url)
     self._print_info('Origin page length: ' + str(len(page_content)))
     return self._parse_urls(page_content, need_subtitle)
Example #11
    def get_hot_items(self, category):
        self._print_info('Getting HOT CAT Items')
        self._print_info(category)
        cat_url = self._get_cat_url(category)
        self._print_info(cat_url)
        #parse_result = feedparser.parse(cat_url)
        self._print_info('HOT CAT Items fetched succeeded!')

        html = utils.get_page_content(cat_url)

        temp = []
        pager = re.compile('<ul class="rlist">(.+?)</ul>').findall(html)

        if len(pager):
            links = re.compile(
                '<a href="/video/(.+?)/" title="(.+?)" target="_blank">(.+?)</a>'
            ).findall(pager[0])

            for p in links:
                img = re.compile('<img src="(.+?)"').findall(p[2])
                temp.append({
                    'title': p[1],
                    'link': p[0],
                    'category': category,
                    'description': p[0],
                    'thumbnail': img[0],
                    'published': p[0]
                })

        return temp
Example #12
 def get_dynamic(self, page=1, pagesize=10):
     if not self.is_login:
         return []
     url = DYNAMIC_URL.format(pagesize, page)
     result = json.loads(utils.get_page_content(url))
     total_page = int((result['data']['page']['count'] + pagesize - 1) / pagesize)
     return result['data']['feeds'], total_page
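
Note on the arithmetic above: total_page is ceiling division written with integers; for example, with count = 25 and pagesize = 10 it evaluates to int((25 + 10 - 1) / 10) = 3 pages.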
Example #13
    def get_hot_items(self, category):
        self._print_info('Getting HOT CAT Items')
        self._print_info(category)
        cat_url = self._get_cat_url(category)
        self._print_info(cat_url)
        #parse_result = feedparser.parse(cat_url)
        self._print_info('HOT CAT Items fetched succeeded!')
 
        html = utils.get_page_content(cat_url)

        temp = []
        pager = re.compile('<ul class="rlist">(.+?)</ul>').findall(html)

        if len(pager):
            links = re.compile('<a href="/video/(.+?)/" title="(.+?)" target="_blank">(.+?)</a>').findall(pager[0])
            

            for p in links:
                img = re.compile('<img src="(.+?)"').findall(p[2])
                temp.append({
                    'title': p[1],
                    'link': p[0],
                    'category': category,
                    'description': p[0],
                    'thumbnail': img[0],
                    'published': p[0]})

        return temp
Example #14
 def _get_index_items(self, url):
     pickle_file_by_word = tempfile.gettempdir() + '/' + url.split('/')[-1].strip() + '_word_tmp.pickle'
     pickle_file_by_month = tempfile.gettempdir() + '/' + url.split('/')[-1].strip() + '_month_tmp.pickle'
     if os.path.exists(pickle_file_by_word) and os.path.exists(pickle_file_by_month) and not self._need_rebuild(pickle_file_by_word) and not self._need_rebuild(pickle_file_by_month):
         return pickle.load(open(pickle_file_by_word, 'rb')), pickle.load(open(pickle_file_by_month, 'rb'))
     else:
         page_content = utils.get_page_content(url)
         results_dict = dict()
         results_month_dict = dict()
         parts = page_content.split('<h3>')
         for part in parts:
             results = self.ITEMS.findall(part)
             key = part[0]
             results_dict[key] = []
             for r in results:
                 results_dict[key].append((r[1], r[2], r[0]))
                 if r[0] in results_month_dict.keys():
                     results_month_dict[r[0]].append((r[1], r[2]))
                 else:
                     results_month_dict[r[0]] = [(r[1], r[2])]
          with open(pickle_file_by_word, 'wb') as word_file:
              pickle.dump(results_dict, word_file)
          with open(pickle_file_by_month, 'wb') as month_file:
              pickle.dump(results_month_dict, month_file)
         return results_dict, results_month_dict
Example #15
 def get_video_list(self, av_id):
     page_full_url = self.BASE_URL + 'video/av' + str(av_id) + '/'
     page_content = utils.get_page_content(page_full_url)
     parts = self.PARTS.findall(page_content)
     if len(parts) == 0:
         return [(u'播放', 'video/av' + str(av_id) + '/')]
     else:
         return [(part[1], part[0][1:]) for part in parts]
Example #17
 def get_dynamic(self, page=1, pagesize=10):
      if not self.is_login:
         return []
     url = DYNAMIC_URL.format(pagesize, page)
     result = json.loads(utils.get_page_content(url))
     total_page = int(
         (result['data']['page']['count'] + pagesize - 1) / pagesize)
     return result['data']['feeds'], total_page
Example #18
 def get_bangumi_detail(self, season_id):
     url = BANGUMI_SEASON_URL.format(season_id)
     result = utils.get_page_content(url)
     if result[0] != '{':
         start = result.find('(') + 1
         end = result.find(');')
         result = result[start:end]
     result = json.loads(result)
     return result['result']
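
The guard above unwraps a JSONP-style response before parsing it as JSON. A small illustration of the slicing, using a made-up payload (the callback name is hypothetical):

    result = 'seasonCallback({"result": {"title": "example"}});'
    if result[0] != '{':
        start = result.find('(') + 1
        end = result.find(');')
        result = result[start:end]
    # result is now '{"result": {"title": "example"}}', which json.loads accepts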
Example #19
 def fetch(cls, url, use_cache=True):
     m = re.match(r'^http://([a-z]{2})\.wikipedia\.org', url)
     page_lang = m.group(1).encode('utf8')
     page_title = extract_page_title(url, page_lang)
     wp = MediaWiki('http://%s.wikipedia.org/w/api.php' % page_lang)
     return cls(
         page_title,
         get_page_content(wp, page_title, page_lang, use_cache) or '',
         page_lang)
Example #20
    def get_cat_items(self, category):
        self._print_info('Getting CAT Items')
        self._print_info(category)
        cat_url = self._get_cat_url(category)
        self._print_info(cat_url)
        #parse_result = feedparser.parse(cat_url)
        self._print_info('CAT Items fetched succeeded!')

        html = utils.get_page_content(cat_url)
        attrs = re.compile(
            '<div class="l-item"><a href="/video/(.+?)/" target="_blank" class="preview" title="(.+?)"><img src="(.+?)" alt="(.+?)"></a><a href="/video/(.+?)/" target="_blank" class="title" title="(.+?)">(.+?)</a>'
        ).findall(html)

        #<a href="/video/av2315815/" target="_blank" class="preview" title="【犯罪】【1990】极道之妻 最后的战争 主演: 岩下志麻 导演: 山下耕作"><img src="http://i2.hdslb.com/320_180/video/0f/0f3662fcc909ad31da2963108ea4d9f6.jpg" alt="【犯罪】【1990】极道之妻 最后的战争 主演: 岩下志麻 导演: 山下耕作"></a>

        temp = [{
            'title': i[1],
            'description': i[1],
            'link': '',
            'category': category,
            'thumbnail': i[2],
            'published': i[0]
        } for i in attrs]

        #for t in temp:
        #    for tt in t:
        #        self._print_info(t[tt])

        pager = re.compile('<div class="pagelistbox">(.+?)</div>').findall(
            html)

        if len(pager):
            links = re.compile('href="/video/(.+?)">(.+?)</a>').findall(
                pager[0])

            for p in links:
                if p[1] == '下页' or p[1] == '末页' or p[1] == '首页 ' or p[
                        1] == '上页 ':
                    temp.append({
                        'title': p[1],
                        'link': p[0],
                        'category': p[0],
                        'description': p[1],
                        'thumbnail': p[1],
                        'published': p[0]
                    })
                else:
                    temp.append({
                        'title': '第' + p[1] + '页',
                        'link': p[0],
                        'category': p[0],
                        'description': p[1],
                        'thumbnail': p[1],
                        'published': p[0]
                    })

        return temp
Example #22
 def get_av_list_detail(self, aid, page=1, fav=0, pagesize=10):
     params = {'id': aid, 'page': page}
     if fav != 0:
         params['fav'] = fav
     url = VIEW_URL.format(self.api_sign(params))
     result = json.loads(utils.get_page_content(url))
     results = [result]
     if (int(page) < result['pages']) and (pagesize > 1):
          results += self.get_av_list_detail(aid, int(page) + 1, fav, pagesize=pagesize - 1)[0]
     return results, result['pages']
Example #23
 def get_history(self, page=1, pagesize=10):
      if not self.is_login:
         return []
     url = HISTORY_URL.format(page, pagesize)
     result = json.loads(utils.get_page_content(url))
     if len(result['data']) >= int(pagesize):
         total_page = int(page) + 1
     else:
         total_page = int(page)
     return result['data'], total_page
Example #24
 def get_history(self, page=1, pagesize=10):
     if not self.is_login:
         return []
     url = HISTORY_URL.format(page, pagesize)
     result = json.loads(utils.get_page_content(url))
     if len(result['data']) >= int(pagesize):
         total_page = int(page) + 1
     else:
         total_page = int(page)
     return result['data'], total_page
Example #25
    def get_cat_items(self, category):
        self._print_info('Getting CAT Items')
        self._print_info(category)
        cat_url = self._get_cat_url(category)
        self._print_info(cat_url)
        #parse_result = feedparser.parse(cat_url)
        self._print_info('CAT Items fetched succeeded!')

        html = utils.get_page_content(cat_url)
        attrs = re.compile(
            '<div class="l-item"><a href="/video/(.+?)/" target="_blank" class="preview"><img src="(.+?)"></a><a href="/video/(.+?)/" target="_blank" class="title">(.+?)</a>'
        ).findall(html)

        temp = [{
            'title': i[3],
            'description': i[3],
            'link': '',
            'category': category,
            'thumbnail': i[1],
            'published': i[0]
        } for i in attrs]

        #for t in temp:
        #    for tt in t:
        #        self._print_info(t[tt])

        pager = re.compile('<div class="pagelistbox">(.+?)</div>').findall(
            html)

        if len(pager):
            links = re.compile('href="/video/(.+?)">(.+?)</a>').findall(
                pager[0])

            for p in links:
                if p[1] == '下页' or p[1] == '末页' or p[1] == '首页 ' or p[
                        1] == '上页 ':
                    temp.append({
                        'title': p[1],
                        'link': p[0],
                        'category': p[0],
                        'description': p[1],
                        'thumbnail': p[1],
                        'published': p[0]
                    })
                else:
                    temp.append({
                        'title': '第' + p[1] + '页',
                        'link': p[0],
                        'category': p[0],
                        'description': p[1],
                        'thumbnail': p[1],
                        'published': p[0]
                    })

        return temp
Example #26
    def get_category_from_web_page(self):
        category_dict = {'0': {'title': u'全部', 'url': HOME_URL, 'subs':[]}}
        node = category_dict['0']
        url = node['url']
        result = BeautifulSoup(utils.get_page_content(url), "html.parser").findAll('li', {'class': 'm-i'})
        for item in result:
            if len(item['class']) != 1:
                continue
            tid = item['data-tid']
            title = item.em.contents[0]
            url = 'http:' + item.a['href']
            category_dict[tid] = {'title': title, 'url': url, 'subs':[]}
            node['subs'].append(tid)

        # Make sure the TV-series ('11') and movie ('23') categories are present
        if '11' not in category_dict['0']['subs']:
            category_dict['0']['subs'].append('11')
        if '23' not in category_dict['0']['subs']:
            category_dict['0']['subs'].append('23')
        category_dict['11'] = {'title': u'电视剧', 'url': 'http://bangumi.bilibili.com/tv/', 'subs': []}
        category_dict['23'] = {'title': u'电影', 'url': 'http://bangumi.bilibili.com/movie/', 'subs': []}

        for sub in category_dict['0']['subs']:
            node = category_dict[sub]
            url = node['url']
            result = BeautifulSoup(utils.get_page_content(url), "html.parser").select('ul.n_num li')
            for item in result[1:]:
                if not item.has_attr('tid'):
                    continue
                if item.a is None:
                    continue
                if item.has_attr('class'):
                    continue
                tid = item['tid']
                title = item.a.contents[0]
                if item.a['href'][:2] == '//':
                    url = 'http:' + item.a['href']
                else:
                    url = HOME_URL + item.a['href']
                category_dict[tid] = {'title': title, 'url': url, 'subs':[]}
                node['subs'].append(tid)
        return category_dict
Example #27
 def get_category_list(self, tid=0, order='default', days=30, page=1, pagesize=10):
     params = {'tid': tid, 'order': order, 'days': days, 'page': page, 'pagesize': pagesize}
     url = LIST_URL.format(self.api_sign(params))
     result = json.loads(utils.get_page_content(url))
     results = []
     for i in range(pagesize):
          if str(i) in result['list']:
             results.append(result['list'][str(i)])
         else:
             break
     return results, result['pages']
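
The api_sign helper used here is not part of the listing. A hedged sketch of what a signer of this shape typically does, assuming the common appkey scheme (sorted query string plus an app secret, md5-digested); the attribute names self.appkey and self.secret are assumptions, not the project's actual names:

    import hashlib
    import urllib

    def api_sign(self, params):
        # Assumed scheme: sign the sorted query string with the app secret.
        params['appkey'] = self.appkey
        query = urllib.urlencode(sorted(params.items()))
        sign = hashlib.md5(query + self.secret).hexdigest()
        return query + '&sign=' + sign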
Example #28
 def get_encryped_pwd(self, pwd):
     import rsa
     result = json.loads(utils.get_page_content(LOGIN_HASH_URL.format(random.random()),
                                                headers={'Referer':'https://passport.bilibili.com/login'}))
     pwd = result['hash'] + pwd
     key = result['key']
     pub_key = rsa.PublicKey.load_pkcs1_openssl_pem(key)
     pwd = rsa.encrypt(pwd.encode('utf-8'), pub_key)
     pwd = base64.b64encode(pwd)
     pwd = urllib.quote(pwd)
     return pwd
Example #29
 def get_video_urls(self, cid):
     m = hashlib.md5()
     m.update(INTERFACE_PARAMS.format(str(cid), SECRETKEY_MINILOADER))
     url = INTERFACE_URL.format(str(cid), m.hexdigest())
     doc = minidom.parseString(utils.get_page_content(url))
     urls = [durl.getElementsByTagName('url')[0].firstChild.nodeValue for durl in doc.getElementsByTagName('durl')]
     urls = [url
             if not re.match(r'.*\.qqvideo\.tc\.qq\.com', url)
             else re.sub(r'.*\.qqvideo\.tc\.qq\.com', 'http://vsrc.store.qq.com', url)
             for url in urls]
     return urls
Example #30
    def get_video_parts2(self, category, video, part):
        # Multi-part videos are handled by this function, starting from the part index
        #
        self._print_info('Getting Video Parts')
        self._print_info(video)
        #parse_result = feedparser.parse(cat_url)
        self._print_info('Parts fetched succeeded!')

        video_urls = []

        title = part
        description = part
        thumbnail = video
        id = video

        url = urllib.urlencode({'kw': 'http://www.bilibili.com' + part})

        p_url = 'http://www.flvcd.com/parse.php?format=&' + url
        html2 = utils.get_page_content(p_url)
        html2 = html2.decode('gbk').encode('utf-8')

        attrs2 = re.compile('<input type="hidden" name="(.+?)" value="(.*?)"').findall(html2)

        filename = ""
        inf = ""
        for i in attrs2:
            if i[0] == "filename":
                filename = i[1]
            if i[0] == "inf":
                inf = i[1]

        link = inf

        video_urls.append({
            'title': '播放',
            'link': link,
            'category': category,
            'description': description,
            'thumbnail': thumbnail,
            'published': id})

        return video_urls
Example #31
    def get_cat_items(self, category):
        self._print_info('Getting CAT Items')
        self._print_info(category)
        cat_url = self._get_cat_url(category)
        self._print_info(cat_url)
        #parse_result = feedparser.parse(cat_url)
        self._print_info('CAT Items fetched succeeded!')
 
        html = utils.get_page_content(cat_url)
        attrs = re.compile('<div class="l-item"><a href="/video/(.+?)/" target="_blank" class="preview" title="(.+?)"><img src="(.+?)" alt="(.+?)"></a><a href="/video/(.+?)/" target="_blank" class="title" title="(.+?)">(.+?)</a>').findall(html)

        #<a href="/video/av2315815/" target="_blank" class="preview" title="【犯罪】【1990】极道之妻 最后的战争 主演: 岩下志麻 导演: 山下耕作"><img src="http://i2.hdslb.com/320_180/video/0f/0f3662fcc909ad31da2963108ea4d9f6.jpg" alt="【犯罪】【1990】极道之妻 最后的战争 主演: 岩下志麻 导演: 山下耕作"></a>

        temp = [{
            'title': i[1],
            'description': i[1],
            'link': '',
            'category': category,
            'thumbnail': i[2],
            'published': i[0]} for i in attrs]

        #for t in temp:
        #    for tt in t:
        #        self._print_info(t[tt])



        pager = re.compile('<div class="pagelistbox">(.+?)</div>').findall(html)

        if len(pager):
            links = re.compile('href="/video/(.+?)">(.+?)</a>').findall(pager[0])


            for p in links:
                if p[1] == '下页' or p[1] == '末页' or p[1] == '首页 ' or p[1] == '上页 ':
                    temp.append({
                        'title': p[1],
                        'link': p[0],
                        'category': p[0],
                        'description': p[1],
                        'thumbnail': p[1],
                        'published': p[0]})
                else:
                    temp.append({
                        'title': '第' + p[1] + '页',
                        'link': p[0],
                        'category': p[0],
                        'description': p[1],
                        'thumbnail': p[1],
                        'published': p[0]})

        return temp
Example #32
 def get_encryped_pwd(self, pwd):
     import rsa
     result = json.loads(
         utils.get_page_content(
             LOGIN_HASH_URL.format(random.random()),
             headers={'Referer': 'https://passport.bilibili.com/login'}))
     pwd = result['hash'] + pwd
     key = result['key']
     pub_key = rsa.PublicKey.load_pkcs1_openssl_pem(key)
     pwd = rsa.encrypt(pwd.encode('utf-8'), pub_key)
     pwd = base64.b64encode(pwd)
     pwd = urllib.quote(pwd)
     return pwd
Example #33
 def get_av_list_detail(self, aid, page=1, fav=0, pagesize=10):
     params = {'id': aid, 'page': page}
     if fav != 0:
         params['fav'] = fav
     url = VIEW_URL.format(self.api_sign(params))
     result = json.loads(utils.get_page_content(url))
     results = [result]
     if (int(page) < result['pages']) and (pagesize > 1):
         results += self.get_av_list_detail(aid,
                                            int(page) + 1,
                                            fav,
                                            pagesize=pagesize - 1)[0]
     return results, result['pages']
Example #34
    def get_video_parts(self, category, video, part):
        self._print_info('Getting Video Parts')
        self._print_info(video)
        #parse_result = feedparser.parse(cat_url)
        self._print_info('Parts fetched succeeded!')

        video_urls = []

        title = part
        description = part
        thumbnail = video
        id = video

        #url=urllib.urlencode(video)

        #p_url='http://www.flvcd.com/parse.php?format=&'+url
        p_url = 'http://api.xinfan.tv:9999/vids/' + video
        videourl = utils.get_page_content(p_url)

        self._print_info(videourl)
        decodejson = {}
        if videourl:
            decodejson = json.loads(videourl)

        self._print_info('p_url')
        self._print_info(p_url)
        self._print_info(videourl)

        if decodejson.get('status'):
            for url in decodejson['urls']:
                video_urls.append({
                    'title': '直接播放',
                    'link': url,
                    'category': category,
                    'description': description,
                    'thumbnail': thumbnail,
                    'published': id
                })
        else:
            video_urls.append({
                'title': '无法播放',
                'link': videourl,
                'category': category,
                'description': description,
                'thumbnail': thumbnail,
                'published': id
            })

        return video_urls
Example #35
 def _get_search_items(self, keyword):
      search_url = r'http://www.bilibili.tv/search?keyword=' + keyword + '&pagesize=500'
     pickle_file = tempfile.gettempdir() + '/' + keyword + '_tmp.pickle'
     if os.path.exists(pickle_file) and not self._need_rebuild(pickle_file):
         return pickle.load(open(pickle_file, 'rb'))
     else:
         page_content = utils.get_page_content(search_url)
         r = self.SEARCH.findall(page_content)
         results = []
          for li, na in r:
              na = self.NOTAG.sub('', na)
              results.append((li, na))
          with open(pickle_file, 'wb') as word_file:
              pickle.dump(results, word_file)
     return results
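
The _need_rebuild check that guards this pickle cache (and the one in Example #14) is not shown. A minimal sketch under the assumption that it is a modification-time cutoff; the one-day TTL is an assumption, not the project's actual value:

    import os
    import time

    def _need_rebuild(self, pickle_file, ttl_seconds=24 * 60 * 60):
        # Rebuild when the cached pickle is older than the TTL.
        return time.time() - os.path.getmtime(pickle_file) > ttl_seconds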
Example #36
 def get_video_urls(self, cid):
     m = hashlib.md5()
     m.update(INTERFACE_PARAMS.format(str(cid), SECRETKEY_MINILOADER))
     url = INTERFACE_URL.format(str(cid), m.hexdigest())
     doc = minidom.parseString(utils.get_page_content(url))
     urls = [
         durl.getElementsByTagName('url')[0].firstChild.nodeValue
         for durl in doc.getElementsByTagName('durl')
     ]
     urls = [
         url if not re.match(r'.*\.qqvideo\.tc\.qq\.com', url) else re.sub(
             r'.*\.qqvideo\.tc\.qq\.com', 'http://vsrc.store.qq.com', url)
         for url in urls
     ]
     return urls
Example #37
def get_video_urls(cid):
    interface_full_url = INTERFACE_URL.format(str(cid))
    print_info('Interface url: ' + interface_full_url)
    # Parse the RSS page
    content = utils.get_page_content(interface_full_url)
    doc = minidom.parseString(content)
    parts = doc.getElementsByTagName('durl')
    print_info('Video parts found: ' + str(len(parts)))
    result = []
    # Collect all video URLs
    for part in parts:
        urls = part.getElementsByTagName('url')
        if len(urls) > 0:
            result.append(urls[0].firstChild.nodeValue)
    return result
Example #38
 def login(self, userid, pwd, captcha):
     #utils.get_page_content('http://www.bilibili.com')
      if self.is_login:
         return True, ''
     pwd = self.get_encryped_pwd(pwd)
     data = 'cType=2&vcType=1&captcha={}&user={}&pwd={}&keep=true&gourl=http://www.bilibili.com/'.format(captcha, userid, pwd)
     result = utils.get_page_content(LOGIN_URL, data,
                                     {'Origin':'https://passport.bilibili.com',
                                      'Referer':'https://passport.bilibili.com/login'})
      if 'DedeUserID' not in requests.utils.dict_from_cookiejar(self.cj):
         return False, LOGIN_ERROR_MAP[json.loads(result)['code']]
     self.cj.save()
     self.is_login = True
     self.mid = str(requests.utils.dict_from_cookiejar(self.cj)['DedeUserID'])
     return True, ''
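
A typical call sequence for this login flow, pieced together from the methods in this listing (the constructor name is hypothetical):

    api = BilibiliAPI()                # hypothetical constructor
    captcha_path = api.get_captcha()   # saves the captcha image to a temp file
    ok, message = api.login('user', 'password', 'captcha text')
    if not ok:
        print(message)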
Example #39
        def thread_func():
            for index, link in enumerate(links):
                if self._stop_event.is_set():
                    return

                file_name, start_identifier = link.href.split('#')

                next_file_name, end_identifier = links[index + 1].href.split(
                    '#') if index + 1 < len(links) else [None, None]

                content = book_file.get_item_with_href(file_name).get_content()
                page_content = None

                if next_file_name == file_name:
                    page_content = get_page_content(content, start_identifier,
                                                    end_identifier)
                else:
                    page_content = get_page_content(content, start_identifier,
                                                    None)

                parsed_content = parse_content(page_content)

                page = Page(parsed_content)
                self._add_page(page, index)
Example #40
    def get_hotjson_items(self, type, category):
        self._print_info('Getting HOT CAT JSON Items')
        self._print_info(category)
        json_url = 'http://www.bilibili.com/index/rank/all-' + type + '-' + category + '.json'
        self._print_info(json_url)
        self._print_info('HOT CAT Items fetched succeeded!')

        html = utils.get_page_content(json_url)

        data = json.loads(html)
        data = data['rank']['list']


        temp = []

        for i in range(0, len(data)):
            #pprint(data[i])
            aid = 'av' + str(data[i]['aid'])
            title = data[i]['title']
            author = data[i]['author']
            description = data[i]['description']
            mid = data[i]['mid']
            pic = data[i]['pic']
            link = 'http://www.bilibili.com/video/' + str(aid)

            temp.append({
                'title': title,
                'link': link,
                'category': category,
                'description': description,
                'thumbnail': pic,
                'published': aid})

            #temp.append({
            #    'title': title,
            #    'link': 'http://www.bilibili.com/video/av'+aid,
            #    'category':category,
            #    'description':description,
            #    'thumbnail':pic,
            #    'published': 'av'+aid })

        return temp
Example #41
    def get_video_paths(self, category, video):
        self._print_info('Getting Video Urls')

        video_urls = []

        title = video
        description = video
        thumbnail = video
        id = video

        p_url = 'http://www.bilibili.com/video/' + id
        html = utils.get_page_content(p_url)

        self._print_info(p_url)
        self._print_info('Video url fetched succeeded!')

        attrs = re.compile("<option value='(.+?)'>(.+?)</option>").findall(html)

        parts = False

        for i in attrs:
            parts = True
            video_urls.append({
                'title': i[1],
                'link': i[0],
                'part': i[0],
                'category': category,
                'description': i[1],
                'thumbnail': i[1],
                'published': id})

        if parts:
            self._print_info('has parts')
        else:
            self._print_info('no  parts')

        self._print_info('End of fetch')
        return video_urls
Example #42
 def _get_index_items_from_web(self, url):
     page_content = utils.get_page_content(url)
     results_dict = dict()
     results_month_dict = dict()
     parts = page_content.split('<h3>')
     for part in parts:
         results = self.ITEMS.findall(part)
         key = part[0]
         results_dict[key] = []
         for r in results:
             results_dict[key].append((r[1], r[2], r[0]))
             if r[0] in results_month_dict.keys():
                 results_month_dict[r[0]].append((r[1], r[2]))
             else:
                 results_month_dict[r[0]] = [(r[1], r[2])]
     return results_dict, results_month_dict
Example #44
def get_subtitle(cid):
    url = COMMENT_URL.format(cid)
    print_info('Page full url: ' + url)
    input = get_tmp_dir() + '/tmp.xml'
    output = get_tmp_dir() + '/tmp.ass'

    local_file = open(input, "w")
    local_file.write(utils.get_page_content(url))
    local_file.close()

    Danmaku2ASS(input, output, WIDTH, HEIGHT,
        font_size=FONT_SIZE,
        text_opacity=TEXT_OPACITY,
        is_reduce_comments=IS_REDUCE_COMMENTS,
        duration_marquee=DURATION_MARQUEE,
        duration_still=DURATION_STILL
    )
    return output
Example #45
    def get_video_parts2(self, category, video, part):
        # Multi-part videos are handled by this function, starting from the part index
        #
        self._print_info('Getting Video Parts')
        self._print_info(video)
        #parse_result = feedparser.parse(cat_url)
        self._print_info('Parts fetched succeeded!')

        video_urls = []

        title = part
        description = part
        thumbnail = video
        id = video

        url = urllib.urlencode({'kw': 'http://www.bilibili.com' + part})

        p_url = 'http://www.flvcd.com/parse.php?format=&' + url
        html2 = utils.get_page_content(p_url)
        html2 = html2.decode('gbk').encode('utf-8')

        attrs2 = re.compile(
            '<input type="hidden" name="(.+?)" value="(.*?)"').findall(html2)

        filename = ""
        inf = ""
        for i in (attrs2):
            if i[0] == "filename":
                filename = i[1]
            if i[0] == "inf":
                inf = i[1]

        link = inf

        video_urls.append({
            'title': '播放',
            'link': link,
            'category': category,
            'description': description,
            'thumbnail': thumbnail,
            'published': id
        })

        return video_urls
Example #46
    def get_anime_series(self, anime):
        self._print_info('Getting ANIME Item')
        anime_url = self.BASE_URL + anime
        self._print_info(anime_url)
        #parse_result = feedparser.parse(cat_url)
        self._print_info('ANIME Item fetched succeeded!')

        html = utils.get_page_content(anime_url)
        #attrs = re.compile('<div class="l-item"><a href="/video/(.+?)/" target="_blank" class="preview"><img src="(.+?)"></a><a href="/video/(.+?)/" target="_blank" class="title">(.+?)</a>').findall(html)
        temp = []

        html = html.replace('\r', '')
        html = html.replace('\n', '')

        spid = re.compile('var spid = "(.+?)";').findall(html)
        series = re.compile('<option value="(.+?)">(.+?)</option>').findall(html)

        if len(spid):
            spid = spid[0]
            self._print_info('spid  ' + spid)

        if len(series):
            for s in series:
                self._print_info('title' + s[1])
                self._print_info('series number ' + s[0])
                link = spid + '-' + s[0]
                #http://www.bilibili.com/sppage/bangumi-13294-1816-1.html
                temp.append({
                    'title': s[1],
                    'link': link,
                    'spid': spid,
                    'seriesid': s[1],
                    'type': 'series',
                    'thumbnail': s[0],
                    'published': s[0]})

        else:
            temp = self.get_next_page(temp, spid, 1)

        return temp
Example #47
    def get_video_paths(self, category, video):
        self._print_info('Getting Video Urls')

        video_urls = []

        title = video
        description = video
        thumbnail = video
        id = video

        p_url = 'http://www.bilibili.com/video/' + id
        html = utils.get_page_content(p_url)

        self._print_info(p_url)
        self._print_info('Video url fetched succeeded!')

        attrs = re.compile("<option value='(.+?)'>(.+?)</option>").findall(
            html)

        parts = False

        for i in attrs:
            parts = True
            url = 'http://www.bilibili.com' + i[0]
            self._print_info(url)
            video_urls.append({
                'title': i[1],
                'link': i[0],
                'part': i[0],
                'category': category,
                'description': i[1],
                'thumbnail': i[1],
                'published': url
            })

        if parts:
            self._print_info('has parts')
        else:
            self._print_info('no  parts')

        self._print_info('End of fetch')
        return video_urls
Example #48
    def get_anime_series(self, anime):
        self._print_info('Getting ANIME Item')
        anime_url = self.BASE_URL + anime
        self._print_info(anime_url)
        #parse_result = feedparser.parse(cat_url)
        self._print_info('ANIME Item fetched succeeded!')

        html = utils.get_page_content(anime_url)
        #attrs = re.compile('<div class="l-item"><a href="/video/(.+?)/" target="_blank" class="preview"><img src="(.+?)"></a><a href="/video/(.+?)/" target="_blank" class="title">(.+?)</a>').findall(html)
        temp = []

        html = html.replace('\r', '')
        html = html.replace('\n', '')

        spid = re.compile('var spid = "(.+?)";').findall(html)
        series = re.compile('<option value="(.+?)">(.+?)</option>').findall(
            html)

        if len(spid):
            spid = spid[0]
            self._print_info('spid  ' + spid)

        if len(series):
            for s in series:
                self._print_info('title' + s[1])
                self._print_info('series number ' + s[0])
                link = spid + '-' + s[0]
                #http://www.bilibili.com/sppage/bangumi-13294-1816-1.html
                temp.append({
                    'title': s[1],
                    'link': link,
                    'spid': spid,
                    'seriesid': s[1],
                    'type': 'series',
                    'thumbnail': s[0],
                    'published': s[0]
                })

        else:
            temp = self.get_next_page(temp, spid, 1)

        return temp
Example #49
    def get_hotjson_items(self, type, category):
        self._print_info('Getting HOT CAT JSON Items')
        self._print_info(category)
        json_url = 'http://www.bilibili.com/index/rank/all-' + type + '-' + category + '.json'
        self._print_info(json_url)
        self._print_info('HOT CAT Items fetched succeeded!')

        html = utils.get_page_content(json_url)

        data = json.loads(html)
        data = (data['rank']['list'])

        temp = []

        for i in range(0, len(data)):
            #pprint(data[i])
            aid = 'av' + str(data[i]['aid'])
            title = (data[i]['title'])
            author = (data[i]['author'])
            description = (data[i]['description'])
            mid = (data[i]['mid'])
            pic = (data[i]['pic'])
            link = 'http://www.bilibili.com/video/' + str(aid)

            temp.append({
                'title': title,
                'link': link,
                'category': category,
                'description': description,
                'thumbnail': pic,
                'published': aid
            })

            #temp.append({
            #    'title': title,
            #    'link': 'http://www.bilibili.com/video/av'+aid,
            #    'category':category,
            #    'description':description,
            #    'thumbnail':pic,
            #    'published': 'av'+aid })

        return temp
Example #50
 def login(self, userid, pwd, captcha):
     #utils.get_page_content('http://www.bilibili.com')
      if self.is_login:
         return True, ''
     pwd = self.get_encryped_pwd(pwd)
     data = 'cType=2&vcType=1&captcha={}&user={}&pwd={}&keep=true&gourl=http://www.bilibili.com/'.format(
         captcha, userid, pwd)
     result = utils.get_page_content(
         LOGIN_URL, data, {
             'Origin': 'https://passport.bilibili.com',
             'Referer': 'https://passport.bilibili.com/login'
         })
      if 'DedeUserID' not in requests.utils.dict_from_cookiejar(self.cj):
         return False, LOGIN_ERROR_MAP[json.loads(result)['code']]
     self.cj.save()
     self.is_login = True
     self.mid = str(
         requests.utils.dict_from_cookiejar(self.cj)['DedeUserID'])
     return True, ''
Example #51
    def get_tag_category(self, cat_id):
        url = "http://www.bilibili.com/index/catalog_tags.json"
        html = utils.get_page_content(url)
        jsondata = json.loads(html)
        self._print_info('get_tag_category')
        items = []
        for i in jsondata:
            if i == cat_id:
                _jsondata = jsondata[i]
                for j in _jsondata:

                    items.append({
                        'label': j,
                        'thumbnail': j,
                        'path': json.dumps(j, ensure_ascii=False).encode('utf8')
                    })
        return items
Example #52
 def get_category_list(self,
                       tid=0,
                       order='default',
                       days=30,
                       page=1,
                       pagesize=10):
     params = {
         'tid': tid,
         'order': order,
         'days': days,
         'page': page,
         'pagesize': pagesize
     }
     url = LIST_URL.format(self.api_sign(params))
     result = json.loads(utils.get_page_content(url))
     results = []
     for i in range(pagesize):
          if str(i) in result['list']:
             results.append(result['list'][str(i)])
         else:
             break
     return results, result['pages']
Example #53
    def get_next_page(self, list, bangumi, page):
        p_url = 'http://www.bilibili.com/sppage/bangumi-' + str(
            bangumi) + '-' + str(page) + '.html'
        html = utils.get_page_content(p_url)

        html = html.replace('\r', '')
        html = html.replace('\n', '')
        checknext = re.compile('<div class="no_more">(.+?)</div>').findall(
            html)
        if len(checknext):
            if checknext[0] == "没有更多信息":
                return list

        else:
            thumbnail = ""
            cover = re.compile(
                '<div class="cover"><img src="(.+?)"></div>').findall(html)
            if len(cover):
                thumbnail = cover[0]

            series = re.compile(
                '<a class="t" href="/video/(.+?)" target="_blank" title="(.+?)">(.+?)</a>'
            ).findall(html)

            if len(series):
                for s in series:
                    list.append({
                        'title': s[1].strip(),
                        'link': s[0].strip(),
                        'type': 'bangumi',
                        'page': page,
                        'thumbnail': thumbnail,
                        'published': s[0].strip()
                    })

            return self.get_next_page(list, bangumi, page + 1)
Example #54
    def get_anime_list(self, list):
        self._print_info('Getting ANIME LIST')
        anime_url = self.BASE_URL + list
        self._print_info(anime_url)
        #parse_result = feedparser.parse(cat_url)
        self._print_info('ANIME LIST fetched succeeded!')

        html = utils.get_page_content(anime_url)
        #attrs = re.compile('<div class="l-item"><a href="/video/(.+?)/" target="_blank" class="preview"><img src="(.+?)"></a><a href="/video/(.+?)/" target="_blank" class="title">(.+?)</a>').findall(html)
        temp = []
        #temp= [{
        #    'title': i[3],
        #    'description': i[3],
        #    'link':'',
        #    'category':category,
        #    'thumbnail':i[1],
        #    'published': i[0]} for i in attrs]

        #for t in temp:
        #    for tt in t:
        #        self._print_info(t[tt])

        html = html.replace('\r', '')
        html = html.replace('\n', '')

        anime = re.compile('<ul class="v_ul">(.+?)</ul>').findall(html)

        if len(anime):
            anime = anime[0].replace('\r', '')
            anime = anime.replace('\n', '')
            anime = re.compile('<li>(.+?)</li>').findall(anime)

            for _anime in anime:
                #print _anime
                #_anime=_anime.replace(' ', '')
                #print _anime
                cover = re.compile('<div class="cover">(.+?)</div>').findall(
                    _anime)
                info_wrp = re.compile(
                    '<div class="info_wrp">(.+?)</div>').findall(_anime)
                info_series = re.compile('<p class="num">(.+?)</p>').findall(
                    _anime)

                image = ""
                link = ""
                title = ""

                if len(cover):
                    _cover = re.compile(
                        '<a href="(.+?)" target="_blank"><img src="(.+?)" /></a>'
                    ).findall(cover[0])
                    if len(_cover):
                        image = _cover[0][1]
                        link = _cover[0][0]

                if len(info_wrp):
                    _info_wrp = re.compile(
                        '<a title="(.+?)" href="(.+?)" target="_blank">(.+?)</a>'
                    ).findall(info_wrp[0])
                    if len(_info_wrp):
                        title = _info_wrp[0][0]

                if len(info_series):
                    info_series = info_series[0].replace('<b>', '')
                    info_series = info_series.replace('</b>', '')
                    title = title + "  " + info_series

                temp.append({
                    'title': title,
                    'link': link,
                    'type': 'sp',
                    'thumbnail': image,
                    'published': link
                })

        pager = re.compile('<div class="pagelistbox">(.+?)</div>').findall(
            html)

        if len(pager):
            links = re.compile('href="(.+?)">(.+?)</a>').findall(pager[0])

            for p in links:
                if p[1] == '下页' or p[1] == '末页' or p[1] == '首页 ' or p[
                        1] == '上页 ':
                    temp.append({
                        'title': p[1],
                        'link': p[0],
                        'type': 'list',
                        'thumbnail': p[1],
                        'published': p[0]
                    })
                else:
                    temp.append({
                        'title': '第' + p[1] + '页',
                        'link': p[0],
                        'type': 'list',
                        'thumbnail': p[1],
                        'published': p[0]
                    })

        return temp
Example #55
    def get_tag_videos(self, cat_id, tagname, page=1):
        print(tagname)
        tagname = tagname[1:-1]
        print(tagname)

        print(tagname)
        url = "http://www.bilibili.com/index/tag/" + cat_id + "/default/" + page + "/" + tagname + ".json"
        print(url)
        html = utils.get_page_content(url)
        jsondata = json.loads(html)
        self._print_info('get_tag_videos')
        print(jsondata)
        items = []
        item = 0
        for i in jsondata:
            _jsondata = jsondata[i]
            if type(_jsondata) is list:
                for j in _jsondata:

                    items.append({
                        'label': j['title'],
                        'thumbnail': j['pic'],
                        'type': 'video',
                        'path': 'av' + j['aid']
                    })
                    item = item + 1
            else:
                if i == 'results':
                    results = _jsondata
                if i == 'num':
                    num = _jsondata
                if i == 'pages':
                    self._print_info('page: ' + page)
                    pages = _jsondata
                    items.append({
                        'label': '首页',
                        'thumbnail': '首页',
                        'type': 'pager',
                        'path': 1
                    })
                    if int(page) < int(pages):
                        items.append({
                            'label': '下一页',
                            'thumbnail': '下一页',
                            'type': 'pager',
                            'path': (int(page) + 1)
                        })
                    if int(page) > 1:
                        items.append({
                            'label': '上一页',
                            'thumbnail': '上一页',
                            'type': 'pager',
                            'path': (int(page) - 1)
                        })
                    items.append({
                        'label': '末页',
                        'thumbnail': '末页',
                        'type': 'pager',
                        'path': int(pages)
                    })

        return items
Example #56
category_re['en'] = re.compile(r'\[\[Category:(.+?)(?:\|.*?)?\]\]')
category_re['fr'] = re.compile(r'\[\[Cat\xe9gorie:(.+?)\]\]')

for rg_id, rg_gid, rg_name, ac_name, rg_sec_types, processed in db.execute(query, query_params):
    colored_out(bcolors.OKBLUE, 'Looking up release group "%s" http://musicbrainz.org/release-group/%s' % (rg_name, rg_gid))
    matches = wps.query(escape_query(rg_name), defType='dismax', qf='name', rows=100).results
    last_wp_request = time.time()
    for match in matches:
        title = match['name']
        if mangle_name(re.sub(' \(.+\)$', '', title)) != mangle_name(rg_name) and mangle_name(title) != mangle_name(rg_name):
            continue
        delay = time.time() - last_wp_request
        if delay < 1.0:
            time.sleep(1.0 - delay)
        last_wp_request = time.time()
        page_orig = get_page_content(wp, title, wp_lang)
        if not page_orig:
            continue
        page_title = title
        url = 'http://%s.wikipedia.org/wiki/%s' % (wp_lang, quote_page_title(page_title),)
        colored_out(bcolors.HEADER, ' * trying article %s' % (title,))
        page = mangle_name(page_orig)

        is_canonical, reason = wp_is_canonical_page(title, page_orig)
        if not is_canonical:
            out(' * %s, skipping' % reason)
            continue

        categories = category_re[wp_lang].findall(page_orig)
        is_album_page = False
        for category in categories:
Example #57
 def fetch(cls, url, use_cache=True):
     m = re.match(r'^http://([a-z]{2})\.wikipedia\.org', url)
     page_lang = m.group(1).encode('utf8')
     page_title = extract_page_title(url, page_lang)
     wp = MediaWiki('http://%s.wikipedia.org/w/api.php' % page_lang)
     return cls(page_title, get_page_content(wp, page_title, page_lang, use_cache) or '', page_lang)
Example #58
    def get_anime_list(self, list):
        self._print_info('Getting ANIME LIST')
        anime_url = self.BASE_URL + list
        self._print_info(anime_url)
        #parse_result = feedparser.parse(cat_url)
        self._print_info('ANIME LIST fetched succeeded!')

        html = utils.get_page_content(anime_url)
        #attrs = re.compile('<div class="l-item"><a href="/video/(.+?)/" target="_blank" class="preview"><img src="(.+?)"></a><a href="/video/(.+?)/" target="_blank" class="title">(.+?)</a>').findall(html)
        temp = []
        #temp= [{
        #    'title': i[3],
        #    'description': i[3],
        #    'link':'',
        #    'category':category,
        #    'thumbnail':i[1],
        #    'published': i[0]} for i in attrs]

        #for t in temp:
        #    for tt in t:
        #        self._print_info(t[tt])

        html = html.replace('\r', '')
        html = html.replace('\n', '')

        anime = re.compile('<ul class="v_ul">(.+?)</ul>').findall(html)



        if len(anime):
            anime = anime[0].replace('\r', '')
            anime = anime.replace('\n', '')
            anime = re.compile('<li>(.+?)</li>').findall(anime)


            for _anime in anime:
                #print _anime
                #_anime=_anime.replace(' ', '')
                #print _anime
                cover = re.compile('<div class="cover">(.+?)</div>').findall(_anime)
                info_wrp = re.compile('<div class="info_wrp">(.+?)</div>').findall(_anime)
                info_series = re.compile('<p class="num">(.+?)</p>').findall(_anime)

                image = ""
                link = ""
                title = ""

                if len(cover):
                    _cover = re.compile('<a href="(.+?)" target="_blank"><img src="(.+?)" /></a>').findall(cover[0])
                    if len(_cover):
                        image = _cover[0][1]
                        link = _cover[0][0]

                if len(info_wrp):
                    _info_wrp = re.compile('<a title="(.+?)" href="(.+?)" target="_blank">(.+?)</a>').findall(info_wrp[0])
                    if len(_info_wrp):
                        title = _info_wrp[0][0]

                if len(info_series):
                    info_series = info_series[0].replace('<b>', '')
                    info_series = info_series.replace('</b>', '')
                    title = title + "  " + info_series

                temp.append({
                    'title': title,
                    'link': link,
                    'type': 'sp',
                    'thumbnail': image,
                    'published': link})

        pager = re.compile('<div class="pagelistbox">(.+?)</div>').findall(html)

        if len(pager):
            links = re.compile('href="(.+?)">(.+?)</a>').findall(pager[0])

            for p in links:
                if p[1] == '下页' or p[1] == '末页' or p[1] == '首页 ' or p[1] == '上页 ':
                    temp.append({
                        'title': p[1],
                        'link': p[0],
                        'type': 'list',
                        'thumbnail': p[1],
                        'published': p[0]})
                else:
                    temp.append({
                        'title': '第' + p[1] + '页',
                        'link': p[0],
                        'type': 'list',
                        'thumbnail': p[1],
                        'published': p[0]})

        return temp
Example #59
 def add_history(self, aid, cid):
     url = ADD_HISTORY_URL.format(str(cid), str(aid))
     utils.get_page_content(url)