def video_from_vid(self, vid, **kwargs):
    handlers = [HTTPCookieProcessor()]
    install_opener(build_opener(*handlers))

    did, tk2 = self.generate_did_tk2()
    api_info_url = 'https://pcweb.api.mgtv.com/player/video?tk2={}&video_id={}&type=pch5'.format(tk2, vid)
    html = get_html(api_info_url)
    content = loads(html)
    title = content['data']['info']['title']
    pm2 = content['data']['atc']['pm2']

    api_source_url = 'https://pcweb.api.mgtv.com/player/getSource?video_id={}&pm2={}&tk2={}&type=pch5'.format(vid, pm2, tk2)
    html = get_html(api_source_url)
    content = loads(html)
    streams = content['data']['stream']
    domains = content['data']['stream_domain']

    # pick a random CDN domain for the stream
    index = randrange(len(domains))
    domain = domains[index]

    # clamp the requested quality level to the streams actually offered
    level = kwargs.get('level', 0)
    if level >= 0:
        level = min(level, len(streams) - 1)
    url = domain + streams[level]['url']

    content = loads(get_html(url))
    url = content['info']
    return re.compile("(.*m3u8)").findall(url)

def main(html_inspire, default_institution):
    m = re.search(r"/([0-9]+)", html_inspire)
    if m is None:
        raise ValueError("not a valid INSPIRE record url")
    inspire_number = m.group(1)
    url = "http://inspirehep.net/record/{0}/export/xn".format(inspire_number)
    xml = get_html(url)
    doc = xmltodict.parse(xml)

    authors = get_authors(doc)
    print "\n" + "=" * 10 + " ALL AUTHORS " + "=" * 10
    authors_list = ", ".join(map(format_name, authors))
    print authors_list
    print "\n found %d authors" % len(authors)

    milan_authors = [author for author in authors
                     if default_institution in " ".join(author[2])]
    print "\n" + "=" * 10 + (" %s AUTHORS " % default_institution) + "=" * 10
    milan_list = "\n".join(map(format_name_italian, milan_authors))
    print milan_list

    print "\n" + "=" * 10 + " TITLE " + "=" * 10
    title = get_title(doc)
    print title

    print "\n" + "=" * 10 + " ABSTRACT " + "=" * 10
    abstract = get_abstract(doc)
    print abstract

    print "\n" + "=" * 10 + " KEYWORDS " + "=" * 10
    keys = dump_keyword.get_keys_from_html(get_html(html_inspire))
    print keys

    return authors_list, milan_list, title, abstract, keys

def episodelist2(url):
    plugin.set_content('TVShows')
    link = get_html(url)
    tree = BeautifulSoup(link, 'html.parser')

    listapi = 'http://my.tv.sohu.com/play/getvideolist.do?playlistid=%s&pagesize=30&order=1'
    match0 = re.compile(r'playlistId\s*=\s*["\'](.+?)["\'];', re.DOTALL).findall(link)
    link = get_html(listapi % match0[0])
    videos = loads(link)['videos']

    items = []
    for item in videos:
        length = item['playLength']
        p_date = item['publishTime'].encode('utf-8')
        p_order = int(item['order'])
        vid = item['vid']
        title = item['subName'].encode('utf-8')
        items.append({
            'label': title,
            'path': url_for('playvideo', name=title,
                            url=item['pageUrl'],
                            image=item['largePicUrl']),
            'thumbnail': item['largePicUrl'],
            'is_playable': True,
            'info': {'title': title},
        })
    return items

def fetch_magic(self, url):
    magic_list = []
    page = get_html(url)
    src = re.findall(r'src="(.+?)"', page)
    js = [path for path in src if path.endswith('.js')]
    host = 'http://' + urlparse(url).netloc
    js_path = [urljoin(host, rel_path) for rel_path in js]

    for p in js_path:
        if 'mtool' in p or 'mcore' in p:
            js_text = get_html(p)
            # unpack the packed JS: payload, base, symbol count, symbol names
            hit = re.search(
                r'\(\'(.+?)\',(\d+),(\d+),\'(.+?)\'\.split\(\'\|\'\),\d+,\{\}\)',
                js_text)
            code = hit.group(1)
            base = hit.group(2)
            size = hit.group(3)
            names = hit.group(4).split('|')

            # rebuild the symbol table: base-N digit string -> original name
            sym_to_name = {}
            for no in range(int(size), 0, -1):
                no_in_base = mapping(no, int(base))
                val = names[no] if no < len(names) and names[no] else no_in_base
                sym_to_name[no_in_base] = val

            moz_ec_name = self.search_dict(sym_to_name, 'mozEcName')
            push = self.search_dict(sym_to_name, 'push')
            patt = r'{}\.{}\("(.+?)"\)'.format(moz_ec_name, push)
            ec_list = re.findall(patt, code)
            magic_list.extend(sym_to_name[ec] for ec in ec_list)
    return magic_list

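# A hedged sketch, not part of the original module: fetch_magic() above calls
# a `mapping` helper that is defined elsewhere. For Dean Edwards-style packed
# JS, symbol-table keys are base-N digit strings over 0-9, a-z, A-Z, so a
# minimal stand-in consistent with that usage could look like this:
def mapping(num, base):
    """Render `num` as a base-`base` digit string (digits 0-9, a-z, A-Z)."""
    digits = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if num == 0:
        return digits[0]
    out = ''
    while num > 0:
        num, rem = divmod(num, base)
        out = digits[rem] + out
    return out
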
def tudou_download_by_iid(self, iid, title, **kwargs):
    data = loads(get_html(
        'http://www.tudou.com/outplay/goto/getItemSegs.action?iid=%s' % iid))
    # among the variants that report sizes, pick the one with the largest total
    temp = max([data[i] for i in data if 'size' in data[i][0]],
               key=lambda x: sum([part['size'] for part in x]))
    vids, size = [t["k"] for t in temp], sum([t["size"] for t in temp])

    urls = []
    for vid in vids:
        xml = get_html('http://ct.v2.tudou.com/f?id=%s' % vid)
        for i in parseString(xml).getElementsByTagName('f'):
            urls.append(i.firstChild.nodeValue.strip())
    return urls

def get_post_pages_by_id(item_id: int) -> Pages:
    """Get Post Pages based on an Item ID."""
    pages = []
    url = HN_ITEMS_URL + '?id={}'.format(item_id)
    pg = extract_page(get_html(url))
    pages.append(pg)
    while pg.has_next:
        newurl = url + '&p={}'.format(pg.pg_number + 1)
        pg = extract_page(get_html(newurl))
        pages.append(pg)
    return Pages(pages)

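# A hedged sketch, not part of the original module: get_post_pages_by_id()
# assumes a Page object exposing `pg_number` and `has_next`, plus a `Pages`
# wrapper. Minimal stand-in definitions consistent with that usage:
from typing import List, NamedTuple

class Page(NamedTuple):
    pg_number: int   # 1-based index of this page
    has_next: bool   # True if a "More" link points to a following page
    # the parsed post rows for the page would live here as well

class Pages:
    """Ordered collection of the pages that make up one item's thread."""
    def __init__(self, pages: List[Page]):
        self.pages = pages
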
def video_from_vid(self, vid, **kwargs):
    vparamap = {0: '1300', 1: '720p', 2: '1080p'}
    url = 'http://player-pc.le.com/mms/out/video/playJson'
    req = {
        'id': vid,
        'platid': 1,
        'splatid': 105,
        'format': 1,
        'tkey': self.calcTimeKey(int(time.time())),
        'domain': 'www.le.com',
        'region': 'cn',
        'source': 1000,
        'accessyx': 1
    }
    r = get_html(url + '?' + urlencode(req))
    info = loads(r)

    playurl = info['msgs']['playurl']
    stream_level = kwargs.get('level', 0)
    # keys() is a view in Python 3; make it indexable
    support_stream_id = list(playurl["dispatch"].keys())
    stype = len(support_stream_id)
    stream_level = min(stream_level, stype - 1)
    stream_id = support_stream_id[stream_level]

    # pick a random domain
    index = randrange(len(playurl['domain']))
    url = playurl["domain"][index] + playurl["dispatch"][stream_id][0]
    uuid = hashlib.sha1(url.encode('utf8')).hexdigest() + '_0'
    url = url.replace('tss=0', 'tss=ios')
    url += ('&m3v=1&termid=1&format=1&hwtype=un&ostype=MacOS10.12.4'
            '&p1=1&p2=10&p3=-&expect=3&tn={}&vid={}&uuid={}&sign=letv').format(
        random(), vid, uuid)
    r2 = get_html(url.encode('utf-8'))
    info2 = loads(r2)

    # more to do: the m3u8 at `location` is encoded and must be decoded
    suffix = '&r=%d&appid=500' % (int(time.time() * 1000))
    m3u8 = get_html(info2['location'] + suffix, decoded=False)
    if m3u8 is None:
        return None
    m3u8_list = self.m3u8decode(m3u8)

    m3u8_file = kwargs.get('m3u8')
    with open(m3u8_file, "wb") as m3u8File:
        m3u8File.write(m3u8_list)
    # the `with` block closes the file; no explicit close() needed

    urls = re.findall(r'^[^#][^\r]*', m3u8_list, re.MULTILINE)
    return urls

def get_category_from_web_page(self):
    # every node carries a 'subs' list so children can be attached below
    category_dict = {'0': {'title': u'全部', 'url': HOME_URL, 'subs': []}}
    node = category_dict['0']
    url = node['url']
    result = BeautifulSoup(get_html(url), "html.parser").findAll(
        'li', {'class': 'm-i'})
    for item in result:
        if len(item['class']) != 1:
            continue
        tid = item['data-tid']
        title = item.em.contents[0]
        url = 'http:' + item.a['href']
        category_dict[tid] = {'title': title, 'url': url, 'subs': []}
        node['subs'].append(tid)

    # fix: ensure the TV-series and movie categories are always present
    if '11' not in category_dict['0']['subs']:
        category_dict['0']['subs'].append('11')
    if '23' not in category_dict['0']['subs']:
        category_dict['0']['subs'].append('23')
    category_dict['11'] = {
        'title': u'电视剧',
        'url': 'http://bangumi.bilibili.com/tv/',
        'subs': []
    }
    category_dict['23'] = {
        'title': u'电影',
        'url': 'http://bangumi.bilibili.com/movie/',
        'subs': []
    }

    for sub in category_dict['0']['subs']:
        node = category_dict[sub]
        url = node['url']
        result = BeautifulSoup(get_html(url), "html.parser").select('ul.n_num li')
        for item in result[1:]:
            if not item.has_attr('tid'):
                continue
            if not hasattr(item, 'a'):
                continue
            if item.has_attr('class'):
                continue
            tid = item['tid']
            title = item.a.contents[0]
            if item.a['href'][:2] == '//':
                url = 'http:' + item.a['href']
            else:
                url = HOME_URL + item.a['href']
            category_dict[tid] = {'title': title, 'url': url, 'subs': []}
            node['subs'].append(tid)
    return category_dict

def get_json(self, api, data=None, pretty=False):
    headers = self.header
    s = loads(get_html(SERVER + api, data=data, headers=headers))
    if pretty:
        print headers
        print dumps(s, sort_keys=True, indent=4, separators=(',', ': '))
    return s

def changeList(url):
    html = get_html(url)
    tree = BeautifulSoup(html, 'html.parser')
    soup = tree.find_all('div', {'class': 'm-tag-type'})

    surl = url.split('/')
    purl = surl[-1].split('-')
    dialog = xbmcgui.Dialog()
    filter = ''
    for iclass in soup:
        title = iclass.find('h5', {'class': 'u-title'}).text
        si = iclass.find_all('a')
        list = []
        for subitem in si:
            list.append(subitem.text)
        sel = dialog.select(title, list)
        if sel < 0:
            continue
        filter += u'|' + title + u'(' + si[sel].text + u')'
        seurl = si[sel]['onclick'].split('/')[-1]
        seurl = seurl.split('-')
        for i in range(0, len(purl)):
            if seurl[i] != '':
                purl[i] = seurl[i]

    surl[-1] = '-'.join(purl)
    url = '/'.join(surl)
    mainlist(url, filter)

def get_dy2018():
    # note: despite the name, this fetches the Mi AI speaker product page
    url = 'https://item.mi.com/product/6334.html'
    print('开始抓取AI音箱...')
    soup = get_soup(get_html(url, 'utf-8'))
    # print(soup.prettify())
    a = soup.select('#J_buyBtnBox a')
    print(a)

def episodelist(url):
    plugin.set_content('TVShows')
    html = get_html(url)
    playcfg = re.compile(r'var webcfg\s?=\s?({.+?);\n').findall(html)
    if playcfg:
        jsplay = loads(playcfg[0])
    else:
        return []

    items = []
    content = jsplay['share_content']
    for item in jsplay['playList']['data']['list']:
        vip = '' if int(item['vip']) == 0 else VIP
        new = NEW if item.get('isNew') else ''
        items.append({
            'label': item['title'] + vip + new,
            'path': url_for('playvideo', vid=item['id'],
                            name=item['title'].encode('utf-8'),
                            image=item['capture'].encode('utf-8')),
            'thumbnail': item['capture'],
            'is_playable': True,
            'info': {'title': item['title']},
        })
    return items

def playList(url):
    html = get_html(url)
    tree = BeautifulSoup(html, 'html.parser')
    lists = tree.find_all('a', {'class': 'vd-list-item'})
    if not lists:
        return []

    items = []
    for item in lists:
        p_thumb = item.img.get('src')
        if p_thumb is None:
            p_thumb = item.img.get('_lazysrc', '')
        d = item.find('i', {'class': 'vtime'})
        # duration is given as "[hh:]mm:ss"; fold it into seconds
        duration = 0
        for t in d.text.split(':'):
            duration = duration * 60 + int(t)
        items.append({
            'label': item['title'],
            'path': url_for('playvideo', url=httphead(item['href'])),
            'thumbnail': p_thumb,
            'is_playable': True,
            'info': {'title': item['title'], 'duration': duration}
        })
    return items

def video_from_vid(self, vid, **kwargs):
    if self.coeff is None:
        magic_list = self.fetch_magic(self.a_mobile_url)
        self.coeff = self.get_coeff(magic_list)

    ep_url = self.video_ep if 'single_video' in kwargs else self.media_ep
    url = ep_url.format(vid)
    meta = loads(get_html(url))
    streams = meta['playlist']

    # clamp the requested quality level to what is actually offered
    maxlevel = len(streams)
    level = kwargs.get('level', 0)
    if level >= maxlevel:
        level = maxlevel - 1
    stream = streams[level]
    definition = stream['code']

    for s in stream['playinfo']:
        codec = 'h' + s['codec'][2:]  # h.264 -> h264
        s_id = definition if codec == 'h264' else '{}_{}'.format(
            definition, codec)
        if s_id in self.stream_types:
            clear_info = self.dec_playinfo(s, self.coeff)
            cdn_list = self.get_cdninfo(clear_info['hashid'])
            base_url = cdn_list[0]
            token = base64.b64encode(clear_info['token'].encode('utf8'))
            video_url = '{}?token={}&vf={}'.format(base_url, token, s['vf'])
            return [video_url]

def gettudoulist(url):
    html = get_html(httphead(url))
    tree = BeautifulSoup(html, 'html.parser')
    items = []
    soup = tree.find_all('div', {'class': 'td-listbox__list__item--show'})
    for item in soup:
        soup2str = str(item)
        title = re.compile(r'title="(.+?)"').findall(soup2str)
        if not title:
            title = re.compile(r"title='(.+?)'").findall(soup2str)
        thumb = re.compile(r'src="(.+?)"').findall(soup2str)
        purl = re.compile(r' href="(.+?)"').findall(soup2str)
        if not (title and thumb and purl):
            continue
        items.append({
            'label': title[0],
            'path': url_for('playvideo', url=purl[0]),
            'is_playable': True,
            'thumbnail': thumb[0],
            'info': {'title': title[0]}
        })
    return items

def filter(url):
    html = get_html(url)
    tree = BeautifulSoup(html, 'html.parser')
    soup = tree.find_all('div', {'class': 'list_nav'})
    dialog = xbmcgui.Dialog()

    urlsplit = url.split('/')
    urltype = re.compile(r'\w.+?_').findall(urlsplit[-1])
    marktype = []
    for item in soup:
        typelist = item.span.text
        title = re.sub(r'\r|\n|\t| ', '', typelist)
        li = item.findAll('li')
        sel = dialog.select(title, [x.text for x in li])
        if sel >= 0:
            # a Tag's `in` tests children, not attributes; check attrs instead
            if 'href' not in li[sel].attrs:
                li[sel]['href'] = url
            selurl = li[sel]['href'].split('/')
            seltype = re.compile(r'\w.+?_').findall(selurl[-1])
            for i in seltype:
                if i not in urltype:
                    marktype.append(i)

    u1 = urlsplit[-1]
    for type in marktype:
        u1 = re.sub(type[0] + '.+?_', type, u1)
    urlsplit[-1] = u1
    url = '/'.join(urlsplit)
    return videolist(url=url, page=1)

def getProgramList(channelId):
    '''
    timeUrl = 'http://live-api.xwei.tv/api/getUnixTimestamp'
    html = get_html(timeUrl)
    timestamp = json.loads(html)['time']
    t = float(timestamp)
    timestamp = int(t/1000)
    '''
    epgAPI = 'http://live-api.xwei.tv/api/getEPGByChannelTime/%s/0/%d'
    info = ''
    try:
        html = get_html(epgAPI % (channelId, int(time.time())))
        results = json.loads(html)['result'][0]
        for prog in results:
            start = time.localtime(prog['start_time'])
            end = time.localtime(prog['end_time'])
            name = prog['name'].replace(' ', '')
            # one line per programme: "HH:MM--HH:MM name"
            info += '%02d:%02d--' % (start[3], start[4])
            info += '%02d:%02d ' % (end[3], end[4])
            info += name + '\n'
    except:
        pass
    return info

def get_video_urls(self, url, need_subtitle=True):
    self._print_info('Getting video address')
    page_full_url = self.BASE_URL + url
    self._print_info('Page url: ' + page_full_url)
    page_content = get_html(page_full_url)
    self._print_info('Origin page length: ' + str(len(page_content)))
    return self._parse_urls(page_content, need_subtitle)

def video_from_url(self, url, **kwargs):
    # Embedded player
    id = r1(r'.tudou.com/v/([^/]+)/', url)
    if id:
        return self.tudou_download_by_id(id, title='')

    html = get_html(url)
    try:
        title = r1(r'\Wkw\s*[:=]\s*[\'\"]([^\n]+?)\'\s*\n', html).replace("\\'", "\'")
        assert title
        title = unescape_html(title)
    except AttributeError:
        title = match1(html, r'id=\"subtitle\"\s*title\s*=\s*\"([^\"]+)\"')
        if title is None:
            title = ''

    vcode = r1(r'vcode\s*[:=]\s*\'([^\']+)\'', html)
    if vcode is None:
        vcode = match1(html, r'viden\s*[:=]\s*\"([\w+/=]+)\"')
    if vcode:
        print "vcode", vcode
        from youku import Youku
        return Youku().video_from_vid(vcode, **kwargs)

    iid = r1(r'iid\s*[:=]\s*(\d+)', html)
    if not iid:
        return self.tudou_download_playlist(url, **kwargs)
    else:
        return self.tudou_download_by_iid(iid, title, **kwargs)

def genlist(start, end):
    global origin
    list_page = common.get_html(origin)
    chapterlist = []
    for i in range(start, end + 1):
        # print(i)
        if i in list(range(1, 51)) + list(range(52, 505)) + \
                list(range(506, 627)):
            text = '^Chapter %s – .*' % str(i)
        elif i in [51]:
            text = '^Chapter %s - .*' % str(i)
        elif i in [505, 981]:
            # these chapters have an extra ".5" part: grab the main chapter
            # here, then fall through to pick up the half-chapter below
            text = '^Chapter %s( |,).*' % str(i)
            link = list_page.find('a', text=re.compile(text))
            url = origin + link['href'].split("/")[-1]
            chapterlist.append(url)
            text = '^Chapter %s.5( |,).*' % str(i)
        elif i == 968:
            text = '^Chapter %s. .*' % str(i)
        elif i in [1615, 2048, 2288]:
            continue
        else:
            text = '^Chapter %s, .*' % str(i)
        link = list_page.find('a', text=re.compile(text))
        url = origin + link['href'].split("/")[-1]
        chapterlist.append(url)
    return chapterlist

def login(self, userid, pwd, captcha):
    #utils.get_html('http://www.bilibili.com')
    if self.is_login:
        return True, ''

    pwd = self.get_encryped_pwd(pwd)
    data = ('cType=2&vcType=1&captcha={}&user={}&pwd={}'
            '&keep=true&gourl=http://www.bilibili.com/').format(
        captcha, userid, pwd)
    result = get_html(
        LOGIN_URL, data, {
            'Origin': 'https://passport.bilibili.com',
            'Referer': 'https://passport.bilibili.com/login'
        })

    key = None
    for ck in self.cj:
        if ck.name == 'DedeUserID':
            key = ck.value
            break
    if key is None:
        return False, LOGIN_ERROR_MAP[loads(result)['code']]

    self.cj.save()
    self.is_login = True
    self.mid = str(key)
    return True, ''

def playfound(url, title, pic):
    items = []
    if not url.startswith('http'):
        return []

    link = get_html(url)
    tvId = r1(r'param\[\'tvid\'\]\s*=\s*"(.+)"', link)
    vid = r1(r'param\[\'vid\'\]\s*=\s*"(.+)"', link)
    if tvId is not None and vid is not None:
        items = [{
            'label': title,
            'path': url_for('playvideo', tvId=tvId, vid=vid,
                            title=title, pic=pic),
            'is_playable': True,
            'info': {'title': title}
        }]
    else:
        albumId = r1('albumid="(.+?)"', link)
        if albumId is not None:
            items = episodelist(albumId, 1)
    return items

def list_sections(section):
    if section == "#":
        return
    html = get_html(_meijumao + section, headers={'Host': 'www.meijumao.net'})
    soup = BeautifulSoup(html, "html.parser")
    listing = []
    is_folder = True
    for section in soup.find_all("article"):
        p_title = section.img.get("alt")
        p_thumb = section.img.get("src")
        list_item = ListItem(label=p_title, thumbnailImage=p_thumb)
        list_item.setProperty('fanart_image', p_thumb)
        url = '{0}?action=list_series&series={1}&seriesname={2}&fanart_image={3}'.format(
            _url, section.a.get("href"), p_title.encode("utf-8"), p_thumb)
        listing.append((url, list_item, is_folder))

    # pagination
    will_page = soup.find("ul", attrs={"id": "will_page"}).find_all("li")
    if len(will_page) > 0:
        # print will_page[0].get("class"), will_page[0].find("a").get("href")
        list_item = ListItem(label="上一页")
        url = '{0}?action=list_sections&section={1}'.format(
            _url, will_page[0].find("a").get("href"))
        listing.append((url, list_item, is_folder))
        list_item = ListItem(label="下一页")
        url = '{0}?action=list_sections&section={1}'.format(
            _url, will_page[-1].find("a").get("href"))
        listing.append((url, list_item, is_folder))

    xbmcplugin.addDirectoryItems(_handle, listing, len(listing))
    xbmcplugin.endOfDirectory(_handle)

def listType1(albumType, albumId):
    url = 'http://cache.video.qiyi.com/jp/sdvlst/%d/%d/' % (albumType, albumId)
    link = get_html(url)
    # the response is not bare JSON; strip everything up to the first '='
    data = link[link.find('=') + 1:]
    json_response = loads(data)

    items = []
    if 'data' not in json_response:
        return []
    for item in json_response['data']:
        items.append({
            'label': item['videoName'] + item['tvYear'],
            'path': url_for('playvideo', tvId=item['tvId'], vid=item['vid'],
                            title=item['videoName'].encode('utf-8'),
                            pic=item['aPicUrl']),
            'thumbnail': item['aPicUrl'],
            'is_playable': True,
            'info': {'title': item['videoName']}
        })
    return items

def root():
    plugin.set_content('TVShows')
    # show search entry
    #yield {
    #    'label': '[COLOR FF00FFFF]<搜索...>[/COLOR]',
    #    'path': url_for('search')
    #}
    #yield {
    #    'label': u'全国电视台',
    #    'path': url_for('tvstudio', url=PPTV_TV_LIST, page=1)
    #}
    data = get_html(PPTV_LIST)
    soup = BeautifulSoup(data, 'html.parser')
    menu = soup.find_all('div', {'class': 'detail_menu'})
    tree = menu[0].find_all('li')
    for item in tree:
        url = item.a['href']
        t = re.compile(r'type_(\d+)').findall(url)
        if len(t) < 1:
            continue
        yield {
            'label': item.a.text,
            'path': url_for('videolist', url=url, page=1)
        }

def genlist(start, end):
    global origin
    list_page = common.get_html(origin)
    chapterlist = []
    for i in range(start, end + 1):
        # print(i)
        text = '^Chapter %s .*' % str(i)
        # several patterns below match the chapter titles exactly as published,
        # apparently including the site's own typos ("Chapter 336"/"Chapter 490"
        # reused, "hapter 839")
        if i in [30, 236, 237, 406, 408, 548, 749, 828]:
            text = '^Chapter %s' % str(i)
        elif i == 336:
            text = '^Chapter 336 – You Lump of Meat!'
        elif i == 337:
            text = '^Chapter 336 – The Tyrannical Ye Lai'
        elif i == 490:
            text = '^Chapter 490 – The Endless Chase'
        elif i == 590:
            text = '^Chapter 490 – The Red Dragon Queen'
        elif i == 830:
            # chapter 830 is split in two: grab part 1 here, then fall
            # through to pick up part 2 below
            text = '^Chapter 830 Rotten and Rusty Army Part 1 ?'
            link = list_page.find('a', text=re.compile(text))
            url = origin + link['href'].split("/")[-1]
            chapterlist.append(url)
            text = '^Chapter 830 – Rotten and Rusty Army Part 2 ?'
        elif i in [857, 861, 862, 863, 864, 865] or i >= 867:
            text = '^Chapter %s.*' % str(i)
        elif i == 51 or i >= 841:
            text = '^Chapter %s-.*' % str(i)
        elif i == 839:
            text = '^hapter 839 – The Flying Slash Part 1'
        link = list_page.find('a', text=re.compile(text))
        url = origin + link['href'].split("/")[-1]
        chapterlist.append(url)
    return chapterlist

def select(url, filter):
    html = get_html(httphead(url))
    tree = BeautifulSoup(html, 'html.parser')
    soup = tree.find_all('div', {'class': 'td__category__filter__panel__item'})
    dialog = xbmcgui.Dialog()
    color = '[COLOR FF00FF00]%s[/COLOR]'
    for item in soup:
        if filter != item.label.text.encode('utf-8'):
            continue
        si = item.find_all('li')
        list = []
        i = 0
        for subitem in si:
            title = subitem.text
            if 'current' in subitem.get('class', ''):
                # highlight the currently selected entry
                title = '[B]{}[/B]'.format(title.encode('utf-8'))
                mark = i
            list.append(title)
            i += 1
        sel = dialog.select(item.label.text, list)
        if sel >= 0:
            url = si[sel].a['href']
    return videolist(url.encode('utf-8'))

def search():
    plugin.set_content('TVShows')
    keyboard = xbmc.Keyboard('', '请输入搜索内容')
    xbmc.sleep(1500)
    keyboard.doModal()
    if not keyboard.isConfirmed():
        return
    keyword = keyboard.getText()

    p_url = 'https://so.mgtv.com/so/k-'
    url = p_url + quote_plus(keyword)
    html = get_html(url)
    tree = BeautifulSoup(html, 'html.parser')
    soup = tree.find_all('div', {'class': 'result-content'})
    items = []
    for x in soup:
        try:
            vid = x.a['video-id']
        except Exception:
            vid = 0
        items.append({
            'label': x.img['alt'],
            'path': url_for('episodelist', url=x.a['href'], id=vid, page=1),
            'thumbnail': httphead(x.img['src']),
        })
    return items

def albumlist(url):
    plugin.set_content('music')
    html = get_html(url)
    tree = BeautifulSoup(html, 'html.parser')
    soup = tree.find_all('div', {'class': 'discoverAlbum_wrapper'})
    albums = soup[0].find_all('div', {'class': 'discoverAlbum_item'})
    for album in albums:
        yield {
            'label': album.img['alt'],
            'thumbnail': album.img['src'],
            'path': url_for('playList', url=album.a['href'],
                            page=1, order='asc')
        }

    soup = tree.find_all('div', {'class': 'pagingBar_wrapper'})
    try:
        pages = soup[0].find_all('a')
    except IndexError:
        return
    for page in pages:
        url = page['href']
        if url == 'javascript:;':
            continue
        yield {
            'label': page.text,
            'path': url_for('albumlist', url=httphead(url.encode('utf-8')))
        }

def genlist(start, end):
    global origin
    list_page = common.get_html(origin)
    chapterlist = []
    for i in range(start, end + 1):
        # print(i)
        text = '^Chapter %s$' % str(i)
        # if i in range(127, 137):
        #     text = '^Chapter %s' % str(i)
        # elif i in [149, 861, 1044, 1212]:
        #     text = '^Chapter %s-.*' % str(i)
        # elif i == 283:
        #     text = '^Chapter 284 – Special Requests'
        # elif i == 284:
        #     text = '^Chapter 284 – Seeing West Wonder King'
        # elif i == 311:
        #     text = '^Chapter 312 – Playing the Role of A Silkpants'
        # elif i == 312:
        #     text = '^Chapter 312 – Keeping Up Appearances'
        # elif i == 1350:
        #     continue
        link = list_page.find('a', text=re.compile(text))
        url = origin + link['href'].split("/")[-1]
        chapterlist.append(url)
    # print(chapterlist)
    # drop duplicates while preserving order
    chapterlist = list(dict.fromkeys(chapterlist))
    return chapterlist

def serieslist(name, url):
    html = get_html(url)
    html = re.sub('\t|\n|\r| ', '', html)
    tree = BeautifulSoup(html, 'html.parser')
    soup = tree.find_all('span', {'class': 'item'})
    info = tree.find('meta', {'name': 'description'})['content']
    img = tree.find('meta', {'itemprop': 'image'})['content']
    for item in soup:
        try:
            p_title = item.a['title']
        except Exception:
            continue
        try:
            href = httphead(item.a['href'])
        except Exception:
            continue
        tn = item.a.text
        title = p_title + '--' + tn
        yield {
            'label': title,
            'path': url_for('playvideo', vid=0),
            'thumbnail': img,
            'info': {'title': title, 'plot': info}
        }

def main(url):
    logging.info("getting html from %s", url)
    html = get_html(url)
    logging.info("searching for keywords")
    keys = get_keywords(html)
    return format_keys(keys)