Exemplo n.º 1
0
def tudou_with_youku_info(url, vidfmt):
    """Resolve a Tudou page that carries a Youku ``vcode``.

    Downloads the page at *url*, extracts the quoted ``vcode`` value, builds
    the corresponding ``v.youku.com`` page URL and returns whatever
    ``flvcd.FLVCD().info`` yields for that URL and *vidfmt*.

    An AttributeError is raised when the page contains no ``vcode``
    (``re.search`` returns None).
    """
    page = HttpUtil().get(url)
    match = re.search(r'vcode\s*[:=]\s*\'([^\']+)\'', page)
    youku_url = 'http://v.youku.com/v_show/id_{0}.html'.format(match.group(1))
    # Kept as a function-scope import, exactly like the original.
    import flvcd
    return flvcd.FLVCD().info(youku_url, vidfmt)
Exemplo n.º 2
0
 def __init__(self, axel, proxy=None, log=None):
     """Set up the recorder state.

     @param axel: stored for later use by the other methods
     @param proxy: when truthy, passed to the HTTP helper's ``set_proxy``
     @param log: forwarded to ``ThreadBase.__init__``
     """
     ThreadBase.__init__(self, log=log)
     self.__axel = axel
     self.__oldurls = []
     self.__urltsks_q = Queue.Queue()
     http_client = HttpUtil()
     self.__http = http_client
     self.__progress_bar = ProgressBar()
     if proxy:
         http_client.set_proxy(proxy)
Exemplo n.º 3
0
 def info(self, url):
     """Return ``(title, items)`` for a sohu.com playlist page.

     The page is downloaded, its ``playlistId`` JavaScript variable is
     extracted, and the playlist JSON is fetched from ``pl.hd.sohu.com``.
     ``title`` is the JSON's ``albumName``; ``items`` is the list of each
     video's ``pageUrl``.

     Raises ValueError when *url* does not contain ``sohu.com`` and
     IndexError when the page has no ``playlistId``.
     """
     if url.find('sohu.com') == -1:
         raise ValueError('not a sohu.com video url')
     import json
     import re
     page = HttpUtil().get(url)
     found_ids = re.findall(r'var playlistId="(?P<s>[^"]*?)";', page)
     playlist_url = r'http://pl.hd.sohu.com/videolist?playlistid=%s' % found_ids[0]
     listing = json.loads(HttpUtil().get(playlist_url), encoding='gbk')
     title = listing['albumName']
     items = []
     for video in listing['videos']:
         items.append(video['pageUrl'])
     return title, items
Exemplo n.º 4
0
def tudou_download(url, vidfmt):
    """Download a Tudou video given its page *url*.

    The page is fetched and decoded with the charset reported by the HTTP
    helper. The item id is read from the ``"k":`` field, falling back to an
    ``iid`` assignment; the title comes from the ``kw`` assignment and is
    HTML-unescaped. The result of ``tudou_download_by_iid(iid, title)`` is
    returned. *vidfmt* is accepted but not used here.

    Raises AssertionError when either the id or the title cannot be found.
    """
    client = HttpUtil()
    raw_page = client.get(url)
    page = raw_page.decode(client.parse_charset())
    iid = r1(r'"k":([^,]*),', page) or r1(r'iid\s*[:=]\s*(\d+)', page)
    assert iid
    raw_title = r1(r"kw\s*[:=]\s*['\"]([^']+)['\"]", page)
    assert raw_title
    return tudou_download_by_iid(iid, unescape_html(raw_title))
Exemplo n.º 5
0
 def info(self, url):
     """Return ``(title, items)`` for a youku.com page.

     The page is downloaded and parsed with BeautifulSoup; the title and the
     item list are computed by the private ``__title`` / ``__items`` helpers
     and also stored on ``self.title`` and ``self.items``.

     Raises ValueError when *url* does not contain ``youku.com``.
     """
     if url.find('youku.com') == -1:
         raise ValueError('not a youku.com video url')
     page = HttpUtil().get(url)
     parsed = BeautifulSoup(page)
     # The title is stored before the items are computed, as in the original.
     self.title = self.__title(page, parsed)
     self.items = self.__items(page, parsed)
     return (self.title, self.items)
Exemplo n.º 6
0
    def __info(self, url, vidfmt):
        """Collect ``(name, play_url)`` pairs for every track of a yytingting book.

        The book id is taken from the ``bookId=`` query parameter of *url*.
        The track listing is paged: a first request (page 1, size 20) is made
        only to learn the ``pageSize`` and ``total`` the server reports, then
        every page is requested with that page size. Each listed resource is
        finally resolved through ``getPlayUrl.action`` to its real URL.

        *vidfmt* is accepted but not used here.

        Returns a list of ``(resName, url)`` tuples in listing order.
        """
        parse_url = 'http://www.yytingting.com/bookstore/playAndDownload.action?' \
                    'id=%s&pageNo=%d&pageSize=%d'
        play_fmt = 'http://www.yytingting.com/resource/getPlayUrl.action?id=%d&type=6'
        book_id = _util.r1(r'bookId=(\d+)', url)
        http = HttpUtil()
        http.add_header('Referer', url)

        first_page = json.loads(http.get(parse_url % (book_id, 1, 20)))
        page_size = first_page['data']['pageSize']
        total = first_page['data']['total']
        # Ceiling division: a trailing, partially filled page must be fetched
        # too (plain ``total / pageSize`` silently dropped it). A non-positive
        # page size yields no pages instead of a ZeroDivisionError.
        if page_size > 0:
            page_count = (total + page_size - 1) // page_size
        else:
            page_count = 0

        tracks = []
        for page_no in range(1, page_count + 1):
            listing = json.loads(http.get(parse_url % (book_id, page_no, page_size)))
            tracks.extend((item['resName'], play_fmt % item['resId'])
                          for item in listing['data']['data'])

        urls = []
        for name, play_url in tracks:
            reply = json.loads(http.get(play_url))
            urls.append((name, reply['data']['url']))
        return urls
Exemplo n.º 7
0
 def info(self, url, merge=True, vidfmt=0):
     """Resolve the downloadable clip URLs of a Sohu video page.

     *vidfmt* selects the quality and indexes
     ``["norVid", "highVid", "superVid", "oriVid"]``; for any value above 0
     the video id of that quality is looked up in the default response and
     the metadata is requested again for it. *merge* is accepted but not
     used here.

     Returns ``(urls, title, 'mp4', 5, None)`` where ``urls`` holds one
     resolved URL per clip and ``title`` is the response's ``tvName``.

     Raises AssertionError when *vidfmt* is out of range, when no video id
     is found in the page, when the clip lists have different lengths, or
     when the first clip is not an ``.mp4`` file.
     """
     format_op = ["norVid", "highVid", "superVid", "oriVid"]
     assert vidfmt in (0, 1, 2, 3)
     http = HttpUtil()
     vid_page = http.get(url)
     vid = r1(r'vid="(\d+)"', vid_page)
     if not vid:
         vid = r1(r'vid:\s*\'(\d+)\'', vid_page)
     assert vid
     import json

     def fetch_meta(video_id):
         # One metadata round trip; shared by the default and the
         # quality-specific lookup.
         html = http.get('http://hot.vrs.sohu.com/vrs_flash.action?vid=%s' % video_id)
         return json.loads(html.decode(http.parse_charset()))

     data = fetch_meta(vid)
     if vidfmt > 0:
         data = fetch_meta(data['data'][format_op[vidfmt]])
     host = data['allot']
     prot = data['prot']
     data = data['data']
     title = data['tvName']
     clips = data['clipsURL']
     assert len(clips) == len(data['clipsBytes']) == len(data['su'])
     # Checked before resolving the clips so that an unsupported container
     # fails without issuing one request per clip first.
     assert clips[0].endswith('.mp4')
     urls = [real_url(host, prot, clip, new)
             for clip, new in zip(clips, data['su'])]
     return urls, title, 'mp4', 5, None
Exemplo n.º 8
0
 def __get_content_len(self, url):
     """Return the ``Content-Length`` of *url* in bytes, or 0 when unknown.

     A HEAD request is tried first; if it does not yield a 2xx status with a
     ``Content-Length`` header, a regular GET response is opened and its
     headers are inspected instead. ``self.proxy`` is applied to the HTTP
     helper when it is truthy.

     0 is returned when the GET fails with ``urllib2.URLError`` (the reason
     is logged), when the status is not 2xx, or when the header is absent.
     A non-numeric header value still raises ValueError from ``int``.
     """
     http = HttpUtil()
     if self.proxy:
         http.set_proxy(self.proxy)
     info = http.head(url)
     if 200 <= info.status < 300:
         # getheader() doubles as the presence test. The previous
         # ``info.msg.dict.has_key('Content-Length')`` check is Python-2
         # only, and - presumably, if head() returns an httplib response
         # whose message dict stores lower-cased names - could never be
         # true. TODO confirm against HttpUtil.head.
         length = info.getheader('Content-Length')
         if length is not None:
             return int(length)
     try:
         resp = http.get_response(url)
     except urllib2.URLError as e:
         self.log.warn('%s \n %s', e.reason, url)
         return 0
     try:
         if 200 <= resp.code < 300:
             # assert resp.has_header('Accept-Ranges')
             length = resp.headers.get('Content-Length')
             if length is not None:
                 return int(length)
     finally:
         # Close the response on every path, not only on success.
         resp.close()
     # Same "unknown" value as the URLError path instead of an implicit None.
     return 0
Exemplo n.º 9
0
 def __init__(self, axel, proxy=None, log=None):
     """Set up the recorder state.

     @param axel: stored for later use by the other methods
     @param proxy: when truthy, passed to the HTTP helper's ``set_proxy``
     @param log: forwarded to ``ThreadBase.__init__``
     """
     ThreadBase.__init__(self, log=log)
     self.__axel = axel
     self.__oldurls = []
     self.__urltsks_q = Queue.Queue()
     http_client = HttpUtil()
     self.__http = http_client
     self.__progress_bar = ProgressBar()
     if proxy:
         http_client.set_proxy(proxy)
Exemplo n.º 10
0
 def __init__(self):
     """Create the HTTP helper with the player's request headers and reset state.

     The helper is built with a utf-8 charset, given the live-player SWF as
     referer and an MSIE 9 user agent, and four extra request headers. The
     remaining attributes start out empty.
     """
     client = HttpUtil(charset="utf-8")
     self.http = client
     client.header_refer_ = "http://v.ifeng.com/include/ifengLivePlayer_v1.40.4.swf"
     client.header_user_agent_ = r"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"
     extra_headers = (
         ("x-flash-version", "11,5,502,146"),
         ("Accept-Language", "zh-CN"),
         ("Accept", "*/*"),
         ("Proxy-Connection", "Keep-Alive"),
     )
     for header_name, header_value in extra_headers:
         client.add_header(header_name, header_value)
     self.uuid = ""
     self.flv_location = ""
     self.schedule_json = None
     self.channels = {}
     self.down_handle = None
Exemplo n.º 11
0
 def info(self, url, vidfmt):
     """Ask flvcd.com to resolve *url* into downloadable clip URLs.

     *vidfmt* above 0 selects the ``high`` / ``super`` / ``real`` format
     parameter. The request URL is printed, fetched with itself as referer,
     decoded as gb2312 and parsed with BeautifulSoup; the clip list comes
     from the ``inf`` input (``|``-separated) and the title from the
     ``name`` input.

     Returns ``(urls, title, None, npf, headers)``, with ``npf`` and
     ``headers`` taken from ``host_filter(url)``. Any exception while
     fetching or parsing yields ``([], '', None, 0, None)`` instead.
     """
     quality_names = ['', 'high', 'super', 'real']
     pieces = ['http://www.flvcd.com/parse.php?', 'kw=' + quote(url), '&flag=one']
     if vidfmt > 0:
         pieces.append('&format=%s' % quality_names[vidfmt])
     pieces.append("&Go=1&go=1")  # 20150723
     parse_url = ''.join(pieces)

     client = HttpUtil()
     client.add_header('Referer', parse_url)
     # A single parenthesised argument prints the same text whether print
     # is a statement or a function.
     print(parse_url)
     try:
         page = client.get(parse_url).decode('gb2312', 'ignore')
         from bs4 import BeautifulSoup
         soup = BeautifulSoup(page)
         m3u = soup.find('input', attrs={'name': 'inf'}).get('value')
         title = soup.find('input', attrs={'name': 'name'}).get('value')
     except Exception:
         # Best effort: an unsupported page is reported as an empty result.
         return [], '', None, 0, None
     urls = m3u.split('|')
     npf, headers = host_filter(url)
     return urls, title, None, npf, headers
Exemplo n.º 12
0
 def info(self, url, vidfmt):
     """Ask flvcd.com to resolve *url* into downloadable clip URLs.

     *vidfmt* above 0 selects the ``high`` / ``super`` / ``real`` format
     parameter. The request URL is printed, fetched with itself as referer,
     decoded as gb2312 and parsed with BeautifulSoup; the clip list comes
     from the ``inf`` input (``|``-separated) and the title from the
     ``name`` input.

     Returns ``(urls, title, None, npf, headers)``, with ``npf`` and
     ``headers`` taken from ``host_filter(url)``. Any exception while
     fetching or parsing yields ``([], '', None, 0, None)`` instead.
     """
     quality_names = ['', 'high', 'super', 'real']
     pieces = ['http://www.flvcd.com/parse.php?', 'kw=' + quote(url), '&flag=one']
     if vidfmt > 0:
         pieces.append('&format=%s' % quality_names[vidfmt])
     pieces.append("&Go=1&go=1")  # 20150723
     parse_url = ''.join(pieces)

     client = HttpUtil()
     client.add_header('Referer', parse_url)
     # A single parenthesised argument prints the same text whether print
     # is a statement or a function.
     print(parse_url)
     try:
         page = client.get(parse_url).decode('gb2312', 'ignore')
         from bs4 import BeautifulSoup
         soup = BeautifulSoup(page)
         m3u = soup.find('input', attrs={'name': 'inf'}).get('value')
         title = soup.find('input', attrs={'name': 'name'}).get('value')
     except Exception:
         # Best effort: an unsupported page is reported as an empty result.
         return [], '', None, 0, None
     urls = m3u.split('|')
     npf, headers = host_filter(url)
     return urls, title, None, npf, headers
Exemplo n.º 13
0
def w56_download_by_id(id, refer, vidfmt=0, merge=True):
    """Look up a 56.com video by *id* and return its download description.

    *vidfmt* picks the quality: 0 ``normal``, 1 ``clear``, 2 ``super``.
    *refer* and *merge* are accepted but not used here.

    Returns ``([url], title, ext, 1, None)`` where ``ext`` is ``'flv'`` or
    ``'mp4'``.

    Raises AssertionError when *vidfmt* is out of range, when the metadata
    does not list exactly one file of the requested quality, or when the
    file extension is neither ``flv`` nor ``mp4``.
    """
    raw = HttpUtil().get('http://vxml.56.com/json/%s/?src=site' % id)
    info = json.loads(raw)['info']
    title = info['Subject']
    assert vidfmt in (0, 1, 2)
    wanted = ('normal', 'clear', 'super')[vidfmt]
    candidates = []
    for entry in info['rfiles']:
        if entry['type'] == wanted:
            candidates.append(entry)
    assert len(candidates) == 1
    chosen = candidates[0]
    # The size is not returned; the conversion is kept because it rejects a
    # payload with a missing or non-numeric ``filesize``.
    size = int(chosen['filesize'])
    url = chosen['url']
    ext = r1(r'\.([^.]+)\?', url)
    assert ext in ('flv', 'mp4')
    return [url], title, str(ext), 1, None
Exemplo n.º 14
0
 def __get_content_len(self, url):
     """Return the ``Content-Length`` of *url* in bytes, or 0 when unknown.

     A HEAD request is tried first; if it does not yield a 2xx status with a
     ``Content-Length`` header, a regular GET response is opened and its
     headers are inspected instead. ``self.proxy`` is applied to the HTTP
     helper when it is truthy.

     0 is returned when the GET fails with ``urllib2.URLError`` (the reason
     is logged), when the status is not 2xx, or when the header is absent.
     A non-numeric header value still raises ValueError from ``int``.
     """
     http = HttpUtil()
     if self.proxy:
         http.set_proxy(self.proxy)
     info = http.head(url)
     if 200 <= info.status < 300:
         # getheader() doubles as the presence test. The previous
         # ``info.msg.dict.has_key('Content-Length')`` check is Python-2
         # only, and - presumably, if head() returns an httplib response
         # whose message dict stores lower-cased names - could never be
         # true. TODO confirm against HttpUtil.head.
         length = info.getheader('Content-Length')
         if length is not None:
             return int(length)
     try:
         resp = http.get_response(url)
     except urllib2.URLError as e:
         self.log.warn('%s \n %s', e.reason, url)
         return 0
     try:
         if 200 <= resp.code < 300:
             # assert resp.has_header('Accept-Ranges')
             length = resp.headers.get('Content-Length')
             if length is not None:
                 return int(length)
     finally:
         # Close the response on every path, not only on success.
         resp.close()
     # Same "unknown" value as the URLError path instead of an implicit None.
     return 0
Exemplo n.º 15
0
class M3u8Stream(ThreadBase):
    """Records an m3u8 live stream by polling its playlist for new clips.

    Every playlist entry that has not been seen before is handed to ``axel``
    as a ``UrlTask`` downloading into an in-memory buffer; finished clips
    are appended, in queue order, to the output stream given to ``recode``.
    """

    def __init__(self, axel, proxy=None, log=None):
        """ @param axel: object whose addTask() receives each clip's UrlTask
            @param proxy: when truthy, passed to the HTTP helper's set_proxy()
            @param log: forwarded to ThreadBase.__init__ """
        ThreadBase.__init__(self, log=log)
        self.__oldurls = []
        self.__urltsks_q = Queue.Queue()
        self.__axel = axel
        self.__http = HttpUtil()
        self.__progress_bar = ProgressBar()
        if proxy:
            self.__http.set_proxy(proxy)

    def recode(self, url, duration, vfmt, fp, npf, freq=10, detach=False):
        """ Store the recording parameters and start recording.

            @param url: address of the m3u8 playlist
            @param duration: seconds to record; a falsy value means no limit
            @param vfmt: live video format (index into the sub playlists)
            @param fp: output stream the clips are written to
            @param npf: download url stream by n parts per file
            @param freq: seconds between two playlist polls
            @param detach: call start() instead of running in the caller """
        self.m3u8url = url
        self.duration = duration
        self.vfmt = int(vfmt) # TODO: ugly conversion
        self.__ostream = fp
        self.__npf = npf
        self.__freq = freq
        if detach:
            self.start()
        else:
            self.run()

    def run(self):
        """ Run the recording loop; always clean up tasks still queued. """
        try:
            self.__loop()
        finally:
            while not self.__urltsks_q.empty():
                self.__urltsks_q.get().cleanup()
            self.log.debug('[M3u8Stream] stop')

    def __loop(self):
        """ Poll the playlist, queue new clips and merge finished ones.

            Returns when the stop flag is set or the duration has elapsed.
            Raises RuntimeError when a clip download reports an error. """
        last_clip_at = 0
        stop_at = 0
        if self.duration:
            stop_at = time.time() + self.duration

        curr_tsk = None
        while not self.isSetStop():
            start_at = time.time()
            self.__progress_bar.display()

            if self.duration and start_at >= stop_at:
                self.log.info("[DownloadLiveStream] time's up")
                return

            # get index page every `freq` seconds
            if last_clip_at + self.__freq < start_at:
                urls, _ = self.__get_curr_m3u8_file(self.m3u8url)
                for url in urls:
                    if url not in self.__oldurls:
                        memfile = BytesIO()
                        # read() is aliased to getvalue() so the merge step
                        # below receives the whole buffer
                        memfile.read = memfile.getvalue
                        urltask = UrlTask(url, out=memfile, npf=self.__npf,
                                          bar=self.__progress_bar, log=self.log)
                        self.__oldurls.append(url)
                        self.__axel.addTask(urltask)
                        self.__urltsks_q.put(urltask)
                # keep the "already seen" list bounded
                if len(self.__oldurls) > 100:
                    self.__oldurls = self.__oldurls[-20:]
                last_clip_at = start_at

            # append to stream; handle error; get a new clip
            if curr_tsk:
                if curr_tsk.isArchived():
                    self.log.debug('[M3u8Stream] merge clip, %s', curr_tsk.url)
                    self.__ostream.write(curr_tsk.out.read())
                    curr_tsk.out.close()
                    curr_tsk.cleanup()
                    curr_tsk = None
                elif curr_tsk.isError():
                    failed_url = curr_tsk.url
                    self.log.error('[M3u8Stream] error: %s', failed_url)
                    curr_tsk.cleanup()
                    # There is no active exception here, so a bare `raise`
                    # cannot re-raise anything; raise an explicit error.
                    raise RuntimeError(
                        '[M3u8Stream] clip download failed: %s' % failed_url)
            elif not self.__urltsks_q.empty():
                curr_tsk = self.__urltsks_q.get()

            if time.time() - start_at < 1:
                sleep(1)

    def __get_curr_m3u8_file(self, m3u8url, n=3):
        """ Fetch a playlist and return (clip urls, targetduration).

            Relative entries are joined onto host_filter(m3u8url). When the
            playlist lists sub playlists ('.m3u8' entries), the one selected
            by self.vfmt (clamped to the last one) is fetched instead.
            Network errors and timeouts are logged and yield what was
            collected so far. `n` is accepted but not used.

            @return: tuple (list of clip urls, targetduration or 0) """
        urls = []
        sub_m3u8s = []
        targetduration = 0
        try:
            m3u8 = self.__http.get(m3u8url)
            for line in m3u8.splitlines(False):
                line = line.strip(' \n')
                if line == '':
                    continue
                if line.startswith('#'):
                    if 'targetduration' in line.lower():
                        targetduration = int(line.split(':')[1])
                        self.log.debug('[M3u8Stream] targetduration=%d', targetduration)
                elif line.startswith('http'):
                    urls.append(line)
                else:
                    url = urllib.basejoin(M3u8Stream.host_filter(m3u8url), line)
                    if line.endswith('.m3u8'):
                        sub_m3u8s.append(url)
                    else:
                        urls.append(url)

            sm_len = len(sub_m3u8s)
            if sm_len > 0:
                fmt_index = self.vfmt if self.vfmt < sm_len else sm_len-1
                self.log.debug('[M3u8Stream] use sub m3u8 url: %s', sub_m3u8s[fmt_index])
                return self.__get_curr_m3u8_file(sub_m3u8s[fmt_index])
        except urllib2.URLError as e:
            # log the exception itself; `e.message` is deprecated and not
            # reliably filled in
            self.log.warn('[M3u8Stream] network not working: %s', e)
        except _socket_timeout:
            self.log.warn('[M3u8Stream] connection timeout')
        return urls, targetduration

    @staticmethod
    def host_filter(url):
        """ Return the base against which relative playlist entries resolve.

            For ifeng.com urls this is scheme://host/; otherwise it is the
            url up to and including its last '/'. """
        if url.find('ifeng.com') > 0:
            return re.match(r'(^http[s]?://[^/?]*/)', url).group(0)
        else:
            return re.match(r'(^http[s]?://.*/)', url).group(0)

# if __name__ == "__main__":
#     main()
Exemplo n.º 16
0
class Spider:
    """Locates and records ifeng.com live TV channels."""

    def __init__(self):
        """Create the HTTP helper with the player's headers and reset state."""
        self.http = HttpUtil(charset="utf-8")
        self.http.header_refer_ = "http://v.ifeng.com/include/ifengLivePlayer_v1.40.4.swf"
        self.http.header_user_agent_ = r"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"
        self.http.add_header("x-flash-version", "11,5,502,146")
        self.http.add_header("Accept-Language", "zh-CN")
        self.http.add_header("Accept", "*/*")
        self.http.add_header("Proxy-Connection", "Keep-Alive")
        self.uuid = ""
        self.flv_location = ""
        self.schedule_json = None
        self.channels = {}
        # `down_handle` is kept for compatibility; `download_handle` is the
        # attribute start_recode() actually assigns.
        self.down_handle = None
        self.download_handle = None

    def start_recode(self, channel_name, duration, output='./'):
        """Record *channel_name* into a time-stamped .flv file under *output*.

        The output directory is created when missing. *duration* is passed
        to ``DownloadStreamHandler`` unchanged.
        """
        output = os.path.abspath(output)
        if not os.path.isdir(output):
            # makedirs also creates missing parent directories
            os.makedirs(output)
        outfile = os.path.join(output, util.get_time_string() + ".flv")
        LOG.info("[channel] %s", channel_name)
        uuid = self._get_uuid(channel_name)
        flv_location = self._get_flv_location(uuid)
        LOG.info("[location] %s", flv_location)
        LOG.info("[output] %s", outfile)
        LOG.info("[start.... ] %s", util.get_time_string())
        # FLV is binary data: open in "wb", and close the file once the
        # transfer is over.
        # NOTE(review): assumes http.fetch() returns only when the download
        # has finished (the "stop" log right after it suggests so) - confirm.
        with open(outfile, "wb") as out:
            self.download_handle = DownloadStreamHandler(out, duration)
            self.http.fetch(flv_location, self.download_handle)
        LOG.info("[stop..... ] %s", util.get_time_string())

    def get_channel_info(self):
        """Download the channel schedule and fill ``self.channels``.

        Returns ``(channels, schedule_json)`` where ``channels`` maps a
        channel name to ``{'uuid': ..., 'url': ...}`` and ``schedule_json``
        is the JSON text extracted from the schedule script.
        """
        data = self.http.get(r'http://v.ifeng.com/live/js/scheduleurls.js?37')
        # The regex stops before the closing brace, so it is added back.
        tmp = util.reg_helper(data,r'g_scheduelUrl\s=\s(?P<f>.*)}')[0] + '}'
        # Single quotes are turned into double quotes before JSON parsing.
        tmp = tmp.replace("\'","\"").decode(encoding="utf-8")
        js = json.loads(s=tmp, encoding="utf-8")
        for uuid, channel in js.items():
            name = channel['name']
            self.channels[name] = {'uuid': uuid, 'url': channel['url']}
        self.schedule_json = tmp
        return self.channels, self.schedule_json

    def _get_uuid(self,channel_name):
        """Return the stream uuid found on the page of *channel_name*.

        The channel list is refreshed first. Raises KeyError for an unknown
        channel name.
        """
        self.get_channel_info()
        url = self.channels[channel_name]['url']
        data = self.http.get(url)
        html = data.decode(CHARSET)
        if html.find(r'uuid=') > 0:
            reg_str = r'uuid=(?P<f>[^|]*)'
        else:
            reg_str = r'http://biz.vsdn.tv380.com/playlive.php\?(?P<f>[^|]*)'
        self.uuid = util.reg_helper(html,reg_str)[0]
        LOG.info("[UUID] %s", self.uuid)
        return self.uuid

    def _get_param(self, uuid):
        """Build the signed query string for *uuid*.

        The timestamp is the current time plus 300 seconds; the signature
        is characters 5..14 of the MD5 hex digest of a fixed salt combined
        with the timestamp and the uuid.
        """
        time_string = str(int(time.time() + 300))
        hash_string = "ifeng" + "7171537bdc0b95c6a23d9e21ea6615ebet720se2zjw" + time_string + uuid + "1" + "ifenuserid="
        # md5 needs bytes: text is encoded, byte strings pass through as is.
        if not isinstance(hash_string, bytes):
            hash_string = hash_string.encode("utf-8")
        hash_result = hashlib.md5(hash_string).hexdigest()
        param = uuid + "&swax_rt=js&ifenai=ifeng&ifenfg=&ifents=" + time_string + "&ifenv=1&ifensg="\
                + hash_result[5:15] + "&ifenuserid="
        return param

    def _get_flv_location(self, uuid):
        """Return the http:// play URL of the stream identified by *uuid*."""
        param = self._get_param(uuid)
        url = r'http://ifenglive.soooner.com/?uuid=%s' % (param)
        data = self.http.get(url)
        html = data.decode(CHARSET)
        reg_str = r'playurl="(?P<f>[^"]*)"'
        self.flv_location = util.reg_helper(html,reg_str)[0]
        # NOTE(review): the next line discards the play URL parsed just
        # above and replaces it with the request URL, so the same address
        # is fetched a second time below. It looks like
        # `self.flv_location.replace(...)` was intended; left unchanged
        # because the second hop's response cannot be verified from here.
        self.flv_location = url.replace("rtmp://", "http://")
        LOG.info("[flv] %s", self.flv_location)
        data = self.http.get(self.flv_location)
        html = data.decode(CHARSET)
        reg_str = r'playurl="(?P<f>[^"]*)"'
        self.flv_location = util.reg_helper(html, reg_str)[0]
        self.flv_location = self.flv_location.replace("rtmp://", "http://")
        return self.flv_location
Exemplo n.º 17
0
def real_url(host, prot, file, new):
    """Resolve one clip to its final download URL.

    A lookup request built from *host*, *prot*, *file* and *new* is sent;
    the ``|``-separated reply supplies the URL prefix (first field, minus
    its last character) and the access key (fourth field). The result is
    ``<prefix><new>?key=<key>``.

    Raises ValueError when the reply does not consist of exactly nine
    fields.
    """
    query = 'http://%s/?prot=%s&file=%s&new=%s' % (host, prot, file, new)
    reply = HttpUtil().get(query)
    # The nine-way unpacking is what enforces the field count.
    prefix, _, _, key, _, _, _, _, _ = reply.split('|')
    return '%s%s?key=%s' % (prefix[:-1], new, key)
Exemplo n.º 18
0
class M3u8Stream(ThreadBase):
    """Records an m3u8 live stream by polling its playlist for new clips.

    Every playlist entry that has not been seen before is handed to ``axel``
    as a ``UrlTask`` downloading into an in-memory buffer; finished clips
    are appended, in queue order, to the output stream given to ``recode``.
    """

    def __init__(self, axel, proxy=None, log=None):
        """ @param axel: object whose addTask() receives each clip's UrlTask
            @param proxy: when truthy, passed to the HTTP helper's set_proxy()
            @param log: forwarded to ThreadBase.__init__ """
        ThreadBase.__init__(self, log=log)
        self.__oldurls = []
        self.__urltsks_q = Queue.Queue()
        self.__axel = axel
        self.__http = HttpUtil()
        self.__progress_bar = ProgressBar()
        if proxy:
            self.__http.set_proxy(proxy)

    def recode(self, url, duration, vfmt, fp, npf, freq=10, detach=False):
        """ Store the recording parameters and start recording.

            @param url: address of the m3u8 playlist
            @param duration: seconds to record; a falsy value means no limit
            @param vfmt: live video format (index into the sub playlists)
            @param fp: output stream the clips are written to
            @param npf: download url stream by n parts per file
            @param freq: seconds between two playlist polls
            @param detach: call start() instead of running in the caller """
        self.m3u8url = url
        self.duration = duration
        self.vfmt = int(vfmt)  # TODO: ugly conversion
        self.__ostream = fp
        self.__npf = npf
        self.__freq = freq
        if detach:
            self.start()
        else:
            self.run()

    def run(self):
        """ Run the recording loop; always clean up tasks still queued. """
        try:
            self.__loop()
        finally:
            while not self.__urltsks_q.empty():
                self.__urltsks_q.get().cleanup()
            self.log.debug('[M3u8Stream] stop')

    def __loop(self):
        """ Poll the playlist, queue new clips and merge finished ones.

            Returns when the stop flag is set or the duration has elapsed.
            Raises RuntimeError when a clip download reports an error. """
        last_clip_at = 0
        stop_at = 0
        if self.duration:
            stop_at = time.time() + self.duration

        curr_tsk = None
        while not self.isSetStop():
            start_at = time.time()
            self.__progress_bar.display()

            if self.duration and start_at >= stop_at:
                self.log.info("[DownloadLiveStream] time's up")
                return

            # get index page every `freq` seconds
            if last_clip_at + self.__freq < start_at:
                urls, _ = self.__get_curr_m3u8_file(self.m3u8url)
                for url in urls:
                    if url not in self.__oldurls:
                        memfile = BytesIO()
                        # read() is aliased to getvalue() so the merge step
                        # below receives the whole buffer
                        memfile.read = memfile.getvalue
                        urltask = UrlTask(url,
                                          out=memfile,
                                          npf=self.__npf,
                                          bar=self.__progress_bar,
                                          log=self.log)
                        self.__oldurls.append(url)
                        self.__axel.addTask(urltask)
                        self.__urltsks_q.put(urltask)
                # keep the "already seen" list bounded
                if len(self.__oldurls) > 100:
                    self.__oldurls = self.__oldurls[-20:]
                last_clip_at = start_at

            # append to stream; handle error; get a new clip
            if curr_tsk:
                if curr_tsk.isArchived():
                    self.log.debug('[M3u8Stream] merge clip, %s', curr_tsk.url)
                    self.__ostream.write(curr_tsk.out.read())
                    curr_tsk.out.close()
                    curr_tsk.cleanup()
                    curr_tsk = None
                elif curr_tsk.isError():
                    failed_url = curr_tsk.url
                    self.log.error('[M3u8Stream] error: %s', failed_url)
                    curr_tsk.cleanup()
                    # There is no active exception here, so a bare `raise`
                    # cannot re-raise anything; raise an explicit error.
                    raise RuntimeError(
                        '[M3u8Stream] clip download failed: %s' % failed_url)
            elif not self.__urltsks_q.empty():
                curr_tsk = self.__urltsks_q.get()

            if time.time() - start_at < 1:
                sleep(1)

    def __get_curr_m3u8_file(self, m3u8url, n=3):
        """ Fetch a playlist and return (clip urls, targetduration).

            Relative entries are joined onto host_filter(m3u8url). When the
            playlist lists sub playlists ('.m3u8' entries), the one selected
            by self.vfmt (clamped to the last one) is fetched instead.
            Network errors and timeouts are logged and yield what was
            collected so far. `n` is accepted but not used.

            @return: tuple (list of clip urls, targetduration or 0) """
        urls = []
        sub_m3u8s = []
        targetduration = 0
        try:
            m3u8 = self.__http.get(m3u8url)
            for line in m3u8.splitlines(False):
                line = line.strip(' \n')
                if line == '':
                    continue
                if line.startswith('#'):
                    if 'targetduration' in line.lower():
                        targetduration = int(line.split(':')[1])
                        self.log.debug('[M3u8Stream] targetduration=%d',
                                       targetduration)
                elif line.startswith('http'):
                    urls.append(line)
                else:
                    url = urllib.basejoin(M3u8Stream.host_filter(m3u8url),
                                          line)
                    if line.endswith('.m3u8'):
                        sub_m3u8s.append(url)
                    else:
                        urls.append(url)

            sm_len = len(sub_m3u8s)
            if sm_len > 0:
                fmt_index = self.vfmt if self.vfmt < sm_len else sm_len - 1
                self.log.debug('[M3u8Stream] use sub m3u8 url: %s',
                               sub_m3u8s[fmt_index])
                return self.__get_curr_m3u8_file(sub_m3u8s[fmt_index])
        except urllib2.URLError as e:
            # log the exception itself; `e.message` is deprecated and not
            # reliably filled in
            self.log.warn('[M3u8Stream] network not working: %s', e)
        except _socket_timeout:
            self.log.warn('[M3u8Stream] connection timeout')
        return urls, targetduration

    @staticmethod
    def host_filter(url):
        """ Return the base against which relative playlist entries resolve.

            For ifeng.com urls this is scheme://host/; otherwise it is the
            url up to and including its last '/'. """
        if url.find('ifeng.com') > 0:
            return re.match(r'(^http[s]?://[^/?]*/)', url).group(0)
        else:
            return re.match(r'(^http[s]?://.*/)', url).group(0)


# if __name__ == "__main__":
#     main()