def __info(self, url, vidfmt):
    """Resolve the playable tracks of a yytingting.com book.

    The book id is taken from the ``bookId=<digits>`` part of *url*. The
    track listing is then walked page by page, and for every track the
    ``getPlayUrl`` endpoint is queried for its media URL.

    Args:
        url: Book page URL; it is also sent as the ``Referer`` header.
        vidfmt: Not used by this method.

    Returns:
        A list of ``(resName, media_url)`` tuples in listing order. The
        list is empty when the listing reports no tracks or no page size.
    """
    parse_url = ('http://www.yytingting.com/bookstore/playAndDownload.action?'
                 'id=%s&pageNo=%d&pageSize=%d')
    play_fmt = 'http://www.yytingting.com/resource/getPlayUrl.action?id=%d&type=6'

    # NOTE(review): presumably _util.r1 returns the first capture group --
    # confirm what it yields when the URL carries no bookId.
    book_id = _util.r1(r'bookId=(\d+)', url)
    http = HttpUtil()
    http.add_header('Referer', url)

    # Probe request: only the paging metadata of this response is used.
    probe = json.loads(http.get(parse_url % (book_id, 1, 20)))
    page_size = probe['data']['pageSize']
    total = probe['data']['total']
    if not page_size or not total:
        return []

    # Ceiling division: floor division silently dropped the last, partially
    # filled page (and every track of books shorter than one page).
    page_count = (total + page_size - 1) // page_size

    tracks = []
    for page_no in range(1, page_count + 1):
        page = json.loads(http.get(parse_url % (book_id, page_no, page_size)))
        tracks.extend((item['resName'], play_fmt % item['resId'])
                      for item in page['data']['data'])

    urls = []
    for name, play_url in tracks:
        reply = json.loads(http.get(play_url))
        urls.append((name, reply['data']['url']))
    return urls
def info(self, url, vidfmt):
    """Resolve the media URLs of *url* through the flvcd.com parser page.

    Args:
        url: Page URL of the video to resolve.
        vidfmt: Quality index into ``['', 'high', 'super', 'real']``.
            Values that are not a valid positive index (0, negatives, or
            anything past the end of the list) request the default quality.

    Returns:
        A 5-tuple ``(urls, title, None, npf, headers)``. ``urls`` is the
        ``inf`` form field split on ``'|'``, ``title`` is the ``name`` form
        field, and ``npf``/``headers`` come from ``host_filter(url)``.
        When the page cannot be fetched or parsed, or yields no URL list,
        the "not supported" value ``([], '', None, 0, None)`` is returned.
    """
    unsupported = ([], '', None, 0, None)
    formats = ['', 'high', 'super', 'real']

    parse_url = 'http://www.flvcd.com/parse.php?'
    parse_url += 'kw=' + quote(url)
    parse_url += '&flag=one'
    # Bounds-checked so an unknown quality index cannot raise IndexError.
    if 0 < vidfmt < len(formats):
        parse_url += '&format=%s' % formats[vidfmt]
    parse_url += "&Go=1&go=1"  # 20150723

    http = HttpUtil()
    http.add_header('Referer', parse_url)
    print(parse_url)
    try:
        html = http.get(parse_url).decode('gb2312', 'ignore')
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(html)
        # find() returns None when the field is absent; the resulting
        # AttributeError is deliberately treated as "not supported".
        m3u = soup.find('input', attrs={'name': 'inf'}).get('value')
        title = soup.find('input', attrs={'name': 'name'}).get('value')
    except Exception:
        return unsupported

    # Tag.get() yields None when the input has no value attribute; an empty
    # value would otherwise produce the bogus URL list [''].
    if not m3u:
        return unsupported

    urls = m3u.split('|')
    npf, headers = host_filter(url)
    return urls, title, None, npf, headers
def info(self, url, vidfmt):
    """Resolve the media URLs of *url* through the flvcd.com parser page.

    Args:
        url: Page URL of the video to resolve.
        vidfmt: Quality index into ``['', 'high', 'super', 'real']``.
            Values that are not a valid positive index (0, negatives, or
            anything past the end of the list) request the default quality.

    Returns:
        A 5-tuple ``(urls, title, None, npf, headers)``. ``urls`` is the
        ``inf`` form field split on ``'|'``, ``title`` is the ``name`` form
        field, and ``npf``/``headers`` come from ``host_filter(url)``.
        When the page cannot be fetched or parsed, or yields no URL list,
        the "not supported" value ``([], '', None, 0, None)`` is returned.
    """
    unsupported = ([], '', None, 0, None)
    formats = ['', 'high', 'super', 'real']

    parse_url = 'http://www.flvcd.com/parse.php?'
    parse_url += 'kw=' + quote(url)
    parse_url += '&flag=one'
    # Bounds-checked so an unknown quality index cannot raise IndexError.
    if 0 < vidfmt < len(formats):
        parse_url += '&format=%s' % formats[vidfmt]
    parse_url += "&Go=1&go=1"  # 20150723

    http = HttpUtil()
    http.add_header('Referer', parse_url)
    print(parse_url)
    try:
        html = http.get(parse_url).decode('gb2312', 'ignore')
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(html)
        # find() returns None when the field is absent; the resulting
        # AttributeError is deliberately treated as "not supported".
        m3u = soup.find('input', attrs={'name': 'inf'}).get('value')
        title = soup.find('input', attrs={'name': 'name'}).get('value')
    except Exception:
        return unsupported

    # Tag.get() yields None when the input has no value attribute; an empty
    # value would otherwise produce the bogus URL list [''].
    if not m3u:
        return unsupported

    urls = m3u.split('|')
    npf, headers = host_filter(url)
    return urls, title, None, npf, headers
class Spider:
    """Record an ifeng live channel to a local ``.flv`` file.

    The flow is: look up the channel page in the schedule script, extract
    the stream uuid from that page, ask the soooner.com dispatcher for the
    play URL, and stream the result to disk.
    """

    def __init__(self):
        """Set up the HTTP client with the headers sent on every request."""
        self.http = HttpUtil(charset="utf-8")
        self.http.header_refer_ = "http://v.ifeng.com/include/ifengLivePlayer_v1.40.4.swf"
        self.http.header_user_agent_ = r"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"
        self.http.add_header("x-flash-version", "11,5,502,146")
        self.http.add_header("Accept-Language", "zh-CN")
        self.http.add_header("Accept", "*/*")
        self.http.add_header("Proxy-Connection", "Keep-Alive")
        self.uuid = ""
        self.flv_location = ""
        self.schedule_json = None
        self.channels = {}
        # Legacy attribute name; kept so existing readers do not break.
        self.down_handle = None
        # The attribute start_recode() actually assigns.
        self.download_handle = None

    def start_recode(self, channel_name, duration, output='./'):
        """Record *channel_name* into a time-stamped ``.flv`` under *output*.

        Args:
            channel_name: Key into the channel table built by
                ``get_channel_info``; an unknown name raises ``KeyError``.
            duration: Passed through to ``DownloadStreamHandler``.
            output: Target directory; created (with parents) if missing.
        """
        output = os.path.abspath(output)
        if not os.path.isdir(output):
            os.makedirs(output)
        outfile = os.path.join(output, util.get_time_string() + ".flv")
        LOG.info("[channel] %s", channel_name)
        uuid = self._get_uuid(channel_name)
        flv_location = self._get_flv_location(uuid)
        LOG.info("[location] %s", flv_location)
        LOG.info("[output] %s", outfile)
        LOG.info("[start.... ] %s", util.get_time_string())
        # FLV is binary data, so the file must be opened in "wb"; the with
        # block guarantees the handle is closed even if the fetch fails.
        # NOTE(review): assumes http.fetch() blocks until the download is
        # finished (the "[stop.....]" log right after suggests so) -- confirm.
        with open(outfile, "wb") as stream:
            self.download_handle = DownloadStreamHandler(stream, duration)
            self.http.fetch(flv_location, self.download_handle)
        LOG.info("[stop..... ] %s", util.get_time_string())

    def get_channel_info(self):
        """Download the schedule script and refresh the channel table.

        Returns:
            ``(channels, schedule_json)``: ``channels`` maps a channel name
            to ``{'uuid': ..., 'url': ...}``, and ``schedule_json`` is the
            JSON text the table was parsed from.
        """
        data = self.http.get(r'http://v.ifeng.com/live/js/scheduleurls.js?37')
        # The regex stops before the final '}', so it is appended back.
        # NOTE(review): presumably reg_helper returns the list of ``f``
        # group matches -- confirm.
        tmp = util.reg_helper(data, r'g_scheduelUrl\s=\s(?P<f>.*)}')[0] + '}'
        # The script uses single-quoted strings; JSON requires double quotes.
        tmp = tmp.replace("'", '"').decode("utf-8")
        # The text is already decoded, so no encoding argument is needed
        # (json.loads rejects that keyword on Python 3.9+).
        js = json.loads(tmp)
        for uuid, channel in js.items():
            self.channels[channel['name']] = {'uuid': uuid, 'url': channel['url']}
        self.schedule_json = tmp
        return self.channels, self.schedule_json

    def _get_uuid(self, channel_name):
        """Fetch the channel page and extract the stream uuid from it."""
        self.get_channel_info()
        url = self.channels[channel_name]['url']
        html = self.http.get(url).decode(CHARSET)
        # Membership test instead of ``find(...) > 0``, which missed a
        # match located at index 0.
        if 'uuid=' in html:
            reg_str = r'uuid=(?P<f>[^|]*)'
        else:
            reg_str = r'http://biz.vsdn.tv380.com/playlive.php\?(?P<f>[^|]*)'
        self.uuid = util.reg_helper(html, reg_str)[0]
        LOG.info("[UUID] %s", self.uuid)
        return self.uuid

    def _get_param(self, uuid):
        """Build the signed query string sent to the dispatcher for *uuid*.

        The ``ifents`` timestamp is the current time plus 300 seconds, and
        the ``ifensg`` signature is characters 5..14 of an MD5 hex digest.
        """
        time_string = str(int(time.time() + 300))
        hash_string = ("ifeng" + "7171537bdc0b95c6a23d9e21ea6615ebet720se2zjw"
                       + time_string + uuid + "1" + "ifenuserid=")
        # Hash explicit UTF-8 bytes rather than relying on implicit coercion
        # of a text string.
        hash_result = hashlib.md5(hash_string.encode("utf-8")).hexdigest()
        param = (uuid + "&swax_rt=js&ifenai=ifeng&ifenfg=&ifents=" + time_string
                 + "&ifenv=1&ifensg=" + hash_result[5:15] + "&ifenuserid=")
        return param

    def _fetch_play_url(self, url):
        """Fetch *url* and return its ``playurl`` value with rtmp swapped for http."""
        html = self.http.get(url).decode(CHARSET)
        play_url = util.reg_helper(html, r'playurl="(?P<f>[^"]*)"')[0]
        return play_url.replace("rtmp://", "http://")

    def _get_flv_location(self, uuid):
        """Resolve the final stream URL for *uuid* in two dispatcher hops.

        The first response names an intermediate play URL; fetching that URL
        yields the final one. The original code rewrote the request URL
        instead of the parsed play URL, so the first result was discarded
        and the same dispatcher URL was requested twice.
        """
        url = r'http://ifenglive.soooner.com/?uuid=%s' % self._get_param(uuid)
        self.flv_location = self._fetch_play_url(url)
        LOG.info("[flv] %s", self.flv_location)
        self.flv_location = self._fetch_play_url(self.flv_location)
        return self.flv_location