def search_video(self, keyword, page_num, num_per_page):
    """Search Baidu (site-restricted to tumblr.com) for *keyword* videos.

    page_num is 1-based; num_per_page is the requested page size.
    Returns a list of dicts with keys "title", "vid", "img", "desc";
    returns [] when the HTTP request fails (matching the sibling scrapers).
    """
    start = (page_num - 1) * num_per_page
    url = "http://www.baidu.com/s?q1=%s&q2=&q3=&q4=&lm=0&ft=&q5=&q6=tumblr.com&tn=baiduadv&pn=%d&rn=%d" % (
        keyword, start, num_per_page)
    try:
        r = requests.get(url, timeout=10)
    except Exception:
        # Consistent with the other site scrapers: best-effort search,
        # a network failure yields no results rather than an exception.
        return []
    tree = etree.parse(StringIO(r.text), etree.HTMLParser())
    divs = tree.xpath(
        '//div[@id="content_left"]/div[@class="result c-container "]')
    results = []
    for div in divs:
        a_node = div.find('.//h3/a')
        title = get_inner_html(a_node)
        vid_link = get_orig_url(a_node.get('href'))
        img_node = div.find('.//div/div/a/img')
        img_link = img_node.get('src') if img_node is not None else ""
        # next(..., None) replaces the Py2-only .next() guarded by a
        # bare except (which also swallowed unrelated errors).
        desc_elem = next(div.iterfind('.//div[@class="c-abstract"]'), None)
        desc = get_inner_html(desc_elem) if desc_elem is not None else ""
        results.append({
            "title": title,
            "vid": vid_link,
            "img": img_link,
            "desc": desc
        })
    return results
def search_video(self, keyword, page_num, num_per_page):
    """Search Baidu (site-restricted to tumblr.com) for *keyword* videos.

    page_num is 1-based; num_per_page is the requested page size.
    Returns a list of dicts with keys "title", "vid", "img", "desc";
    returns [] when the HTTP request fails (matching the sibling scrapers).
    """
    start = (page_num - 1) * num_per_page
    url = "http://www.baidu.com/s?q1=%s&q2=&q3=&q4=&lm=0&ft=&q5=&q6=tumblr.com&tn=baiduadv&pn=%d&rn=%d" % (
        keyword, start, num_per_page)
    try:
        r = requests.get(url, timeout=10)
    except Exception:
        # Consistent with the other site scrapers: a failed search
        # produces an empty result list, not an exception.
        return []
    tree = etree.parse(StringIO(r.text), etree.HTMLParser())
    divs = tree.xpath(
        '//div[@id="content_left"]/div[@class="result c-container "]')
    results = []
    for div in divs:
        a_node = div.find('.//h3/a')
        title = get_inner_html(a_node)
        vid_link = get_orig_url(a_node.get('href'))
        img_node = div.find('.//div/div/a/img')
        img_link = img_node.get('src') if img_node is not None else ""
        # next(..., None) replaces the Py2-only .next() + bare except.
        desc_elem = next(div.iterfind('.//div[@class="c-abstract"]'), None)
        desc = get_inner_html(desc_elem) if desc_elem is not None else ""
        results.append({
            "title": title,
            "vid": vid_link,
            "img": img_link,
            "desc": desc
        })
    return results
def get_link(self, url):
    """Resolve *url* to its embedded video.

    Follows the page's first <iframe> to the player page, then reads the
    <source src> (resolved through get_orig_url) and the <video poster>
    image. Raises VideoNotFound(url) when either page lacks the element.
    Returns a dict with keys "vid", "img", "desc" ("desc" is always "").
    """
    def _fetch_tree(page_url):
        # One GET + lenient HTML parse, shared by both page loads.
        response = requests.get(page_url, timeout=10)
        return etree.parse(StringIO(response.text), etree.HTMLParser())

    outer = _fetch_tree(url)
    frames = outer.xpath('//iframe[@src]')
    if not frames:
        raise VideoNotFound(url)

    player = _fetch_tree(frames[0].get("src"))
    sources = player.xpath('//source[@src]')
    if not sources:
        raise VideoNotFound(url)
    vid_link = get_orig_url(sources[0].get("src"))

    posters = player.xpath('//video[@poster]')
    img_link = posters[0].get("poster") if posters else ''
    return {"vid": vid_link, "img": img_link, "desc": ""}
class Xiaokaxiu(Site):
    """Scraper for xiaokaxiu.com: resolves share pages to direct media
    links via the site API, and searches via Baidu site-restricted search."""

    def __init__(self):
        pass

    @timeit
    def get_link(self, url):
        """Resolve a xiaokaxiu share *url* (…/v/<scid>.html) to media links.

        Returns {"vid", "img", "desc"}; raises VideoNotFound when the URL
        path does not match /v/<scid>.html.
        """
        parsed = urlparse.urlsplit(url)
        patt = re.compile(r"/v/(.*).html")
        match = patt.search(parsed.path)
        if not match:
            raise VideoNotFound()
        scid = match.group(1)
        api_url = ('http://api.xiaokaxiu.com/video/web/get_play_video?scid=%s'
                   % scid)
        r = requests.get(api_url, timeout=10)
        data = json.loads(r.text)
        img_link = data["data"]["cover"]
        vid_link = data["data"]["linkurl"]
        return {"vid": vid_link, "img": img_link, "desc": ''}

    @timeit
    def search_video(self, keyword, page_num, num_per_page):
        """Search Baidu (restricted to xiaokaxiu.com) for *keyword*.

        page_num is 1-based. Returns a list of dicts with keys "title",
        "vid", "img", "desc"; returns [] on network failure.
        """
        start = (page_num - 1) * num_per_page
        url = "http://www.baidu.com/s?q1=%s&q2=&q3=&q4=&lm=0&ft=&q5=&q6=xiaokaxiu.com&tn=baiduadv&pn=%d&rn=%d" % (
            keyword, start, num_per_page)
        try:
            r = requests.get(url, timeout=10)
        except Exception:
            # Best-effort search: a failed request yields no results.
            # ("except Exception:" is valid Py2 and Py3, unlike ", e".)
            return []
        tree = etree.parse(StringIO(r.text), etree.HTMLParser())
        divs = tree.xpath(
            '//div[@id="content_left"]/div[@class="result c-container "]')
        results = []
        for div in divs:
            a_node = div.find('.//h3/a')
            title = get_inner_html(a_node)
            vid_link = get_orig_url(a_node.get('href'))
            img_node = div.find('.//div/div/a/img')
            img_link = img_node.get('src') if img_node is not None else ""
            # next(..., None) replaces the Py2-only .next() + bare except.
            desc_elem = next(
                div.iterfind('.//div[@class="c-abstract"]'), None)
            desc = get_inner_html(desc_elem) if desc_elem is not None else ""
            results.append({
                "title": title,
                "vid": vid_link,
                "img": img_link,
                "desc": desc
            })
        return results
class Meipai(Site):
    """Scraper for meipai.com video detail pages plus Baidu site search."""

    def __init__(self):
        pass

    @timeit
    def get_link(self, url):
        """Resolve a meipai detail page *url* to media links.

        Reads the video URL from #detailVideo's data-video attribute, the
        cover from its <img src>, and the description from the page's
        detail-description heading. Raises VideoNotFound(url) when the
        data-video attribute is missing.
        """
        r = requests.get(url, timeout=10)
        tree = etree.parse(StringIO(r.text), etree.HTMLParser())
        links = tree.xpath('//div[@id="detailVideo"]/@data-video')
        if len(links) == 0:
            raise VideoNotFound(url)
        vid_link = links[0]
        img_links = tree.xpath('//div[@id="detailVideo"]/img/@src')
        img_link = img_links[0] if img_links else ''
        descs = tree.xpath('//h1[@class="detail-description break"]/text()')
        desc = " ".join(descs)
        return {"vid": vid_link, "img": img_link, "desc": desc}

    @timeit
    def search_video(self, keyword, page_num, num_per_page):
        """Search Baidu (restricted to meipai.com) for *keyword*.

        page_num is 1-based. Returns a list of dicts with keys "title",
        "vid", "img", "desc"; returns [] on network failure.
        """
        start = (page_num - 1) * num_per_page
        url = "http://www.baidu.com/s?q1=%s&q2=&q3=&q4=&lm=0&ft=&q5=&q6=meipai.com&tn=baiduadv&pn=%d&rn=%d" % (
            keyword, start, num_per_page)
        try:
            r = requests.get(url, timeout=10)
        except Exception:
            # Best-effort: failed request -> empty result list.
            return []
        tree = etree.parse(StringIO(r.text), etree.HTMLParser())
        divs = tree.xpath(
            '//div[@id="content_left"]/div[@class="result c-container "]')
        results = []
        for div in divs:
            a_node = div.find('.//h3/a')
            title = get_inner_html(a_node)
            vid_link = get_orig_url(a_node.get('href'))
            img_node = div.find('.//div/div/a/img')
            img_link = img_node.get('src') if img_node is not None else ""
            # next(..., None) replaces the Py2-only .next() + bare except.
            desc_elem = next(
                div.iterfind('.//div[@class="c-abstract"]'), None)
            desc = get_inner_html(desc_elem) if desc_elem is not None else ""
            results.append({
                "title": title,
                "vid": vid_link,
                "img": img_link,
                "desc": desc
            })
        return results
def get_link(self, url):
    """Pull the player_src / player_poster values embedded in the page.

    The video URL is URL-decoded and resolved through get_orig_url; the
    description comes from the first div.detail_des element's text.
    Raises VideoNotFound(url) when no player_src is present.
    Returns a dict with keys "vid", "img", "desc".
    """
    page = requests.get(url, timeout=10).text

    src_match = re.search(r'player_src=([^"]*)"', page)
    if not src_match:
        raise VideoNotFound(url)
    vid_link = get_orig_url(urllib.unquote(src_match.group(1)))

    poster_match = re.search(r'player_poster=([^&]*)&', page)
    img_link = urllib.unquote(poster_match.group(1)) if poster_match else ''

    tree = etree.parse(StringIO(page), etree.HTMLParser())
    desc_nodes = tree.xpath('//div[@class="detail_des"]')
    desc = desc_nodes[0].text if desc_nodes else ""
    return {"vid": vid_link, "img": img_link, "desc": desc}
class Vlook(Site):
    """Scraper for vlook.cn pages plus Baidu site search."""

    def __init__(self):
        pass

    @timeit
    def get_link(self, url):
        """Extract player_src / player_poster values embedded in the page.

        Raises VideoNotFound(url) when no player_src is found. Returns a
        dict with keys "vid", "img", "desc".
        """
        r = requests.get(url, timeout=10)
        result = r.text
        patt = re.compile(r'player_src=([^"]*)"')
        match = patt.search(result)
        if not match:
            raise VideoNotFound(url)
        src = urllib.unquote(match.group(1))
        vid_link = get_orig_url(src)
        patt = re.compile(r'player_poster=([^&]*)&')
        match = patt.search(result)
        img_link = urllib.unquote(match.group(1)) if match else ''
        tree = etree.parse(StringIO(result), etree.HTMLParser())
        descs = tree.xpath('//div[@class="detail_des"]')
        desc = descs[0].text if descs else ""
        return {"vid": vid_link, "img": img_link, "desc": desc}

    @timeit
    def search_video(self, keyword, page_num, num_per_page):
        """Search Baidu (restricted to vlook.cn) for *keyword*.

        page_num is 1-based. Returns a list of dicts with keys "title",
        "vid", "img", "desc"; returns [] on network failure.
        """
        start = (page_num - 1) * num_per_page
        url = "http://www.baidu.com/s?q1=%s&q2=&q3=&q4=&lm=0&ft=&q5=&q6=vlook.cn&tn=baiduadv&pn=%d&rn=%d" % (
            keyword, start, num_per_page)
        try:
            r = requests.get(url, timeout=10)
        except Exception:
            # Best-effort: failed request -> empty result list.
            return []
        tree = etree.parse(StringIO(r.text), etree.HTMLParser())
        divs = tree.xpath(
            '//div[@id="content_left"]/div[@class="result c-container "]')
        results = []
        for div in divs:
            a_node = div.find('.//h3/a')
            title = get_inner_html(a_node)
            vid_link = get_orig_url(a_node.get('href'))
            img_node = div.find('.//div/div/a/img')
            img_link = img_node.get('src') if img_node is not None else ""
            # next(..., None) replaces the Py2-only .next() + bare except.
            desc_elem = next(
                div.iterfind('.//div[@class="c-abstract"]'), None)
            desc = get_inner_html(desc_elem) if desc_elem is not None else ""
            results.append({
                "title": title,
                "vid": vid_link,
                "img": img_link,
                "desc": desc
            })
        return results
class Weipainv(Site):
    """Scraper for weipainv.com jwplayer pages plus Baidu site search."""

    def __init__(self):
        pass

    @timeit
    def get_link(self, url):
        """Extract the jwplayer setup block and read its file/image URLs.

        Raises VideoNotFound when the page has no jwplayer setup or the
        setup block lacks the expected file/image fields. Returns a dict
        with keys "vid", "img", "desc".
        """
        r = requests.get(url, timeout=10)
        result = r.text
        patt = re.compile(r'jwplayer\(".*"\).setup\(\{(.*?)\}\)', re.M | re.S)
        match = patt.search(result)
        if not match:
            raise VideoNotFound()
        content = match.group(1)
        link = re.search(r'file: "(.*?)".*image: "(.*?)"', content,
                         re.M | re.S)
        if link is None:
            # Explicit None check replaces the old bare except around
            # .group(), which also hid unrelated errors.
            raise VideoNotFound()
        vid_link = link.group(1)
        img_link = link.group(2)
        return {"vid": vid_link, "img": img_link, "desc": ''}

    @timeit
    def search_video(self, keyword, page_num, num_per_page):
        """Search Baidu (restricted to weipainv.com) for *keyword*.

        page_num is 1-based. Returns a list of dicts with keys "title",
        "vid", "img", "desc"; returns [] on network failure.
        """
        start = (page_num - 1) * num_per_page
        url = "http://www.baidu.com/s?q1=%s&q2=&q3=&q4=&lm=0&ft=&q5=&q6=weipainv.com&tn=baiduadv&pn=%d&rn=%d" % (
            keyword, start, num_per_page)
        try:
            r = requests.get(url, timeout=10)
        except Exception:
            # Best-effort: failed request -> empty result list.
            return []
        tree = etree.parse(StringIO(r.text), etree.HTMLParser())
        divs = tree.xpath(
            '//div[@id="content_left"]/div[@class="result c-container "]')
        results = []
        for div in divs:
            a_node = div.find('.//h3/a')
            title = get_inner_html(a_node)
            vid_link = get_orig_url(a_node.get('href'))
            img_node = div.find('.//div/div/a/img')
            img_link = img_node.get('src') if img_node is not None else ""
            # next(..., None) replaces the Py2-only .next() + bare except.
            desc_elem = next(
                div.iterfind('.//div[@class="c-abstract"]'), None)
            desc = get_inner_html(desc_elem) if desc_elem is not None else ""
            results.append({
                "title": title,
                "vid": vid_link,
                "img": img_link,
                "desc": desc
            })
        return results
class Xiaoying(Site):
    """Scraper for xiaoying.tv: resolves share pages via the JSONP web API,
    searches via Baidu site-restricted search."""

    def __init__(self):
        pass

    @timeit
    def get_link(self, url):
        """Resolve a xiaoying share *url* (…/v/<puid>/) to media links.

        Calls the publishinfo.get endpoint for the cover (optional) and the
        videourl endpoint for the stream URL (required). Both responses are
        JSONP; the JSON payload is extracted with a regex. Raises
        VideoNotFound when the puid or the video payload is missing.
        """
        parsed = urlparse.urlsplit(url)
        patt = re.compile(r"/v/([^/]*)/")
        match = patt.search(parsed.path)
        if not match:
            raise VideoNotFound()
        puid = match.group(1)
        img_url = ('http://w.api.xiaoying.co/webapi2/rest/video/publishinfo.get'
                   '?callback=videocallbackinfo&appkey=30000000&puid=%s' % puid)
        vid_url = ('http://w.api.xiaoying.co/webapi2/rest/video/videourl'
                   '?callback=videocallbackvideosrc&appkey=30000000&puid=%s'
                   % puid)
        r = requests.get(img_url, timeout=10)
        # Strip the JSONP wrapper: callback({...}) -> {...}
        patt = re.compile(r"\((\{.*\})\)")
        match = patt.search(r.text)
        img_link = ""
        if match:
            data = json.loads(match.group(1))
            img_link = data["videoinfo"]["coverurl"]
        r = requests.get(vid_url, timeout=10)
        match = patt.search(r.text)
        if not match:
            raise VideoNotFound()
        data = json.loads(match.group(1))
        vid_link = data["url"]
        return {"vid": vid_link, "img": img_link, "desc": ""}

    @timeit
    def search_video(self, keyword, page_num, num_per_page):
        """Search Baidu (restricted to xiaoying.tv) for *keyword*.

        page_num is 1-based. Returns a list of dicts with keys "title",
        "vid", "img", "desc"; returns [] on network failure.
        """
        start = (page_num - 1) * num_per_page
        url = "http://www.baidu.com/s?q1=%s&q2=&q3=&q4=&lm=0&ft=&q5=&q6=xiaoying.tv&tn=baiduadv&pn=%d&rn=%d" % (
            keyword, start, num_per_page)
        try:
            r = requests.get(url, timeout=10)
        except Exception:
            # Best-effort: failed request -> empty result list.
            return []
        tree = etree.parse(StringIO(r.text), etree.HTMLParser())
        divs = tree.xpath(
            '//div[@id="content_left"]/div[@class="result c-container "]')
        results = []
        for div in divs:
            a_node = div.find('.//h3/a')
            title = get_inner_html(a_node)
            vid_link = get_orig_url(a_node.get('href'))
            img_node = div.find('.//div/div/a/img')
            img_link = img_node.get('src') if img_node is not None else ""
            # next(..., None) replaces the Py2-only .next() + bare except.
            desc_elem = next(
                div.iterfind('.//div[@class="c-abstract"]'), None)
            desc = get_inner_html(desc_elem) if desc_elem is not None else ""
            results.append({
                "title": title,
                "vid": vid_link,
                "img": img_link,
                "desc": desc
            })
        return results
class Aishipin(Site):
    """Scraper for aishipin.net post pages plus Baidu site search."""

    def __init__(self):
        pass

    @timeit
    def get_link(self, url):
        """Resolve an aishipin post *url* to media links.

        Looks for a setCuSunPlayerVideo(...) call inside #post_content;
        falls back to get_sinaimg_video() when absent. Raises
        VideoNotFound(url) when #post_content is missing.
        """
        r = requests.get(url, timeout=20)
        result = r.text
        tree = etree.parse(StringIO(result), etree.HTMLParser())
        links = tree.xpath('//div[@id="post_content"]')
        if len(links) == 0:
            raise VideoNotFound(url)
        vid_node = links[0]
        html = etree.tostring(vid_node)
        patt = re.compile(r"setCuSunPlayerVideo\((.*)\)")
        match = patt.search(html)
        if not match:
            return self.get_sinaimg_video(tree, url)
        params = match.group(1).split(",")
        # Arguments arrive as quoted strings; [1:-1] strips the quotes.
        img_link = "http://www.aishipin.net" + params[2][1:-1]
        vid_link = params[3][1:-1]
        p_nodes = vid_node.findall('.//p[@style]')
        # NOTE(review): assumes at least two styled <p> nodes exist when the
        # player call is present — confirm against real pages.
        desc = get_inner_html(p_nodes[1])
        return {"vid": vid_link, "img": img_link, "desc": desc}

    def get_sinaimg_video(self, tree, url=""):
        """Fallback: read the first <source src> from *tree*.

        *url* is only used in the VideoNotFound message; previously this
        referenced an undefined name and raised NameError instead. The
        parameter defaults to "" so existing one-argument calls still work.
        """
        links = tree.xpath('//source[@src]')
        if len(links) == 0:
            raise VideoNotFound(url)
        vid_link = links[0].get("src")
        return {"vid": vid_link, "img": "", "desc": ""}

    @timeit
    def search_video(self, keyword, page_num, num_per_page):
        """Search Baidu (restricted to aishipin.net) for *keyword*.

        page_num is 1-based. Returns a list of dicts with keys "title",
        "vid", "img", "desc"; returns [] on network failure.
        """
        start = (page_num - 1) * num_per_page
        url = "http://www.baidu.com/s?q1=%s&q2=&q3=&q4=&lm=0&ft=&q5=&q6=aishipin.net&tn=baiduadv&pn=%d&rn=%d" % (
            keyword, start, num_per_page)
        try:
            r = requests.get(url, timeout=10)
        except Exception:
            # Best-effort: failed request -> empty result list.
            return []
        tree = etree.parse(StringIO(r.text), etree.HTMLParser())
        divs = tree.xpath(
            '//div[@id="content_left"]/div[@class="result c-container "]')
        results = []
        for div in divs:
            a_node = div.find('.//h3/a')
            title = get_inner_html(a_node)
            vid_link = get_orig_url(a_node.get('href'))
            img_node = div.find('.//div/div/a/img')
            img_link = img_node.get('src') if img_node is not None else ""
            # next(..., None) replaces the Py2-only .next() + bare except.
            desc_elem = next(
                div.iterfind('.//div[@class="c-abstract"]'), None)
            desc = get_inner_html(desc_elem) if desc_elem is not None else ""
            results.append({
                "title": title,
                "vid": vid_link,
                "img": img_link,
                "desc": desc
            })
        return results
class Weibo(Site):
    """Scraper for Sina Weibo video links plus Baidu site search."""

    def __init__(self):
        pass

    @timeit
    def get_link(self, url):
        """Resolve a weibo/sinajs/sinaimg *url* to a direct mp4 link.

        weibo.com|weibo.cn pages: read the <embed flashvars> "list=" value.
        sinajs.cn URLs: read the "file=" query value. sinaimg.cn URLs are
        used directly. Raises VideoNotFound when the embed/flashvars data
        is missing. Returns a dict with keys "vid", "img", "desc".
        """
        parsed = urlparse.urlsplit(url)
        netloc = parsed.netloc
        sinaimg_url = ""
        img_link = ""
        if re.search(r"weibo.com|weibo.cn", netloc):
            r = requests.get(url, timeout=10)
            tree = etree.parse(StringIO(r.text), etree.HTMLParser())
            links = tree.xpath('//embed/@flashvars')
            if len(links) == 0:
                raise VideoNotFound()
            match = re.search(r'list=(.*)', links[0])
            if not match:
                raise VideoNotFound()
            sinaimg_url = urllib.unquote(match.group(1))
            img_links = tree.xpath('//img/@src')
            if len(img_links) > 0:
                img_link = img_links[0]
        elif re.search(r"sinajs.cn", netloc):
            match = re.search(r"file=(.*)", url)
            sinaimg_url = urllib.unquote(match.group(1))
        elif re.search(r"sinaimg.cn", netloc):
            sinaimg_url = url
        # NOTE(review): for any other netloc, sinaimg_url stays "" and is
        # passed to extract_mp4 (defined elsewhere) — confirm intended.
        path = self.extract_mp4(sinaimg_url)
        netloc = urlparse.urlsplit(sinaimg_url).netloc
        vid_link = "http://%s/%s" % (netloc, path)
        return {"vid": vid_link, "img": img_link, "desc": ""}

    @timeit
    def search_video(self, keyword, page_num, num_per_page):
        """Search Baidu (restricted to video.weibo.com) for *keyword*.

        page_num is 1-based. Returns a list of dicts with keys "title",
        "vid", "img", "desc"; returns [] on network failure.
        """
        start = (page_num - 1) * num_per_page
        url = "http://www.baidu.com/s?q1=%s&q2=&q3=&q4=&lm=0&ft=&q5=&q6=video.weibo.com&tn=baiduadv&pn=%d&rn=%d" % (
            keyword, start, num_per_page)
        try:
            r = requests.get(url, timeout=10)
        except Exception:
            # Best-effort: failed request -> empty result list.
            return []
        tree = etree.parse(StringIO(r.text), etree.HTMLParser())
        divs = tree.xpath(
            '//div[@id="content_left"]/div[@class="result c-container "]')
        results = []
        for div in divs:
            a_node = div.find('.//h3/a')
            title = get_inner_html(a_node)
            vid_link = get_orig_url(a_node.get('href'))
            img_node = div.find('.//div/div/a/img')
            img_link = img_node.get('src') if img_node is not None else ""
            # next(..., None) replaces the Py2-only .next() + bare except.
            desc_elem = next(
                div.iterfind('.//div[@class="c-abstract"]'), None)
            desc = get_inner_html(desc_elem) if desc_elem is not None else ""
            results.append({
                "title": title,
                "vid": vid_link,
                "img": img_link,
                "desc": desc
            })
        return results
class Weipai(Site):
    """Scraper for weipai.cn theater pages plus Baidu site search."""

    def __init__(self):
        pass

    @timeit
    def get_link(self, url):
        """Resolve a weipai page *url* to a direct video link.

        Reads the video id from the play button's onclick handler, fetches
        the share page, and decodes the base64 's' query parameter of the
        wrapper URL to recover the real 'p' video link. Raises
        VideoNotFound at each step when the expected data is missing.
        """
        r = requests.get(url, timeout=10)
        tree = etree.parse(StringIO(r.text), etree.HTMLParser())
        links = tree.xpath(
            '//div[@class="video_player"]/a[@class="play"]/@onclick')
        if len(links) == 0:
            raise VideoNotFound(url)
        match = re.search(r"playVideo\('(.*?)'", links[0])
        if not match:
            raise VideoNotFound(url)
        vid = match.group(1)
        share_link = ('http://share.weipai.cn/video/play/id/%s/type/theater'
                      '/source/undefine' % vid)
        r = requests.get(share_link, timeout=10)
        match_url = re.search(r"'(http.*?)'", r.text)
        if not match_url:
            raise VideoNotFound(url)
        wrapper_params = urlparse.urlsplit(match_url.group(1))
        codes = urlparse.parse_qs(wrapper_params.query)['s']
        if len(codes) == 0:
            raise VideoNotFound()
        # The 's' parameter is a base64-encoded query string whose 'p'
        # value is the actual video URL.
        vid_params = base64.b64decode(codes[0])
        links = urlparse.parse_qs(vid_params)['p']
        if len(links) == 0:
            raise VideoNotFound()
        vid_link = links[0]
        img_links = tree.xpath(
            '//div[@class="video_player"]/div/span/img/@src')
        # Bug fix: was initialized to the int 0; every other scraper uses
        # "" for a missing image link.
        img_link = ""
        if len(img_links) > 0:
            img_link = img_links[0]
        return {"vid": vid_link, "img": img_link, "desc": ""}

    @timeit
    def search_video(self, keyword, page_num, num_per_page):
        """Search Baidu (restricted to weipai.cn) for *keyword*.

        page_num is 1-based. Returns a list of dicts with keys "title",
        "vid", "img", "desc"; returns [] on network failure.
        """
        start = (page_num - 1) * num_per_page
        url = "http://www.baidu.com/s?q1=%s&q2=&q3=&q4=&lm=0&ft=&q5=&q6=weipai.cn&tn=baiduadv&pn=%d&rn=%d" % (
            keyword, start, num_per_page)
        try:
            r = requests.get(url, timeout=5)
        except Exception:
            # Best-effort: failed request -> empty result list.
            return []
        tree = etree.parse(StringIO(r.text), etree.HTMLParser())
        divs = tree.xpath(
            '//div[@id="content_left"]/div[@class="result c-container "]')
        results = []
        for div in divs:
            a_node = div.find('.//h3/a')
            title = get_inner_html(a_node)
            vid_link = get_orig_url(a_node.get('href'))
            img_node = div.find('.//div/div/a/img')
            img_link = img_node.get('src') if img_node is not None else ""
            # next(..., None) replaces the Py2-only .next() + bare except.
            desc_elem = next(
                div.iterfind('.//div[@class="c-abstract"]'), None)
            desc = get_inner_html(desc_elem) if desc_elem is not None else ""
            results.append({
                "title": title,
                "vid": vid_link,
                "img": img_link,
                "desc": desc
            })
        return results
class Miaopai(Site):
    """Scraper for miaopai.com video pages plus Baidu site search."""

    def __init__(self):
        pass

    @timeit
    def get_link(self, url):
        """Resolve a miaopai page *url* to media links.

        Reads the scid from the player's <param name="src"> value and
        builds the gslb stream URL; falls back to __search_mp4 (defined
        elsewhere in this class) when no such param exists. Raises
        VideoNotFound(url) when the param has no scid.
        """
        r = requests.get(url, timeout=10)
        tree = etree.parse(StringIO(r.text), etree.HTMLParser())
        links = tree.xpath('//param[@name="src"]/@value')
        if len(links) == 0:
            return self.__search_mp4(tree)
        match = re.search(r"\?scid=(.*?)&", links[0])
        if not match:
            raise VideoNotFound(url)
        scid = match.group(1)
        vid_link = "http://gslb.miaopai.com/stream/%s.mp4" % (scid)
        img_links = tree.xpath('//div[@class="video_img"]/img/@src')
        img_link = img_links[0] if img_links else ''
        descs = tree.xpath('//div[@class="introduction"]/p')
        # NOTE(review): .text may be None when the <p> is empty — callers
        # appear to treat desc loosely; confirm before tightening.
        desc = descs[0].text if descs else ''
        return {"vid": vid_link, "img": img_link, "desc": desc}

    @timeit
    def search_video(self, keyword, page_num, num_per_page):
        """Search Baidu (restricted to miaopai.com) for *keyword*.

        page_num is 1-based. Returns a list of dicts with keys "title",
        "vid", "img", "desc"; returns [] on network failure.
        """
        start = (page_num - 1) * num_per_page
        url = "http://www.baidu.com/s?q1=%s&q2=&q3=&q4=&lm=0&ft=&q5=&q6=miaopai.com&tn=baiduadv&pn=%d&rn=%d" % (
            keyword, start, num_per_page)
        try:
            r = requests.get(url, timeout=10)
        except Exception:
            # Best-effort: failed request -> empty result list.
            return []
        tree = etree.parse(StringIO(r.text), etree.HTMLParser())
        divs = tree.xpath(
            '//div[@id="content_left"]/div[@class="result c-container "]')
        results = []
        for div in divs:
            a_node = div.find('.//h3/a')
            title = get_inner_html(a_node)
            vid_link = get_orig_url(a_node.get('href'))
            img_node = div.find('.//div/div/a/img')
            img_link = img_node.get('src') if img_node is not None else ""
            # next(..., None) replaces the Py2-only .next() + bare except.
            desc_elem = next(
                div.iterfind('.//div[@class="c-abstract"]'), None)
            desc = get_inner_html(desc_elem) if desc_elem is not None else ""
            results.append({
                "title": title,
                "vid": vid_link,
                "img": img_link,
                "desc": desc
            })
        return results