예제 #1
0
    def parse_links(self):
        """Collect the video link hrefs of the playlist.

        The playlist page is fetched and scanned line by line for
        ``pl-video-title-link`` anchors (an alternative to parsing the page
        with BeautifulSoup). Further batches are then fetched through the
        "load more" url for as long as ``self._load_more_url`` yields one.

        :rtype: list
        :returns:
            The hrefs found, in the order they were encountered. Hrefs from
            the first page are cut off at their first ``&``; hrefs from the
            follow-up batches match ``/watch?v=<id>``.
        """
        url = self.construct_playlist_url()
        req = request.get(url)

        # Keep only the lines holding a video title link, then take the href
        # value up to (not including) the first '&'.
        content = [x for x in req.split('\n') if 'pl-video-title-link' in x]
        link_list = [x.split('href="', 1)[1].split('&', 1)[0] for x in content]

        # The above only returns 100 or fewer links
        # Simulating a browser request for the load more link
        load_more_url = self._load_more_url(req)
        # Truthiness (rather than len()) also ends the loop cleanly should
        # the helper ever hand back None instead of an empty string.
        while load_more_url:
            logger.debug('load more url: %s', load_more_url)
            req = request.get(load_more_url)
            load_more = json.loads(req)
            videos = re.findall(
                r'href=\"(/watch\?v=[\w-]*)',
                load_more['content_html'],
            )
            # Remove duplicates within this batch, keeping first-seen order.
            link_list.extend(OrderedDict.fromkeys(videos))
            load_more_url = self._load_more_url(
                load_more['load_more_widget_html'],
            )

        return link_list
예제 #2
0
    def prefetch(self):
        """Eagerly download all necessary data.

        Issues every network request up front so that the other operations
        do not need to make calls outside of the interpreter, which can
        block for long periods of time.

        :rtype: None
        :raises VideoUnavailable:
            when the expected markup is missing from the watch page html.

        """
        self.watch_html = request.get(url=self.watch_url)
        # NOTE(review): the video is reported as unavailable when this markup
        # is ABSENT from the page -- confirm that polarity is intended.
        marker_present = '<img class="icon meh" src="/yts/img' in self.watch_html
        if not marker_present:
            raise VideoUnavailable('This video is unavailable.')

        self.embed_html = request.get(url=self.embed_url)
        self.age_restricted = extract.is_age_restricted(self.watch_html)

        # Everything gathered so far feeds the video-info url builder.
        info_url_kwargs = {
            'video_id': self.video_id,
            'watch_url': self.watch_url,
            'watch_html': self.watch_html,
            'embed_html': self.embed_html,
            'age_restricted': self.age_restricted,
        }
        self.vid_info_url = extract.video_info_url(**info_url_kwargs)
        self.vid_info = request.get(self.vid_info_url)

        # The player javascript is only fetched here for videos that are
        # not age restricted.
        if self.age_restricted:
            return
        self.js_url = extract.js_url(self.watch_html, self.age_restricted)
        self.js = request.get(self.js_url)
예제 #3
0
    def filesize(self):
        """Size of the media stream in bytes.

        The value is taken from the ``content-length`` response header on
        first use and cached in ``self._filesize`` for later calls.

        :rtype: int
        :returns:
            Filesize (in bytes) of the stream.
        """
        # Already looked up once: reuse the cached value, no request needed.
        if self._filesize is not None:
            return self._filesize

        response_headers = request.get(self.url, headers=True)
        self._filesize = int(response_headers['content-length'])
        return self._filesize
예제 #4
0
    def download(self, output_path=None, filename=None, filename_prefix=None):
        """Download the media stream into a file on disk.

        Each chunk of the stream is handed to ``self.on_progress`` together
        with the open file handle and the number of bytes still expected
        (presumably that callback performs the actual write -- confirm).
        ``self.on_complete`` is called with the file handle once the
        ``with`` block has exited.

        :param output_path:
            (optional) Output path for writing media file. If one is not
            specified, defaults to the current working directory.
        :type output_path: str or None
        :param filename:
            (optional) Output filename (stem only) for writing media file.
            The stream's subtype is appended as the extension. If one is
            not specified, the default filename is used.
        :type filename: str or None
        :param filename_prefix:
            (optional) A string that will be prepended to the filename,
            for example a number in a playlist or the name of a series.
            If one is not specified, nothing will be prepended.
            This is separate from filename so you can use the default
            filename but still add a prefix.
        :type filename_prefix: str or None

        :rtype: str
        :returns:
            Path of the file that was opened for writing.

        """
        if not output_path:
            output_path = os.getcwd()

        # A caller-supplied stem is sanitised and given the stream's
        # extension; otherwise the stream's own default name is used.
        if filename:
            filename = '{filename}.{s.subtype}'.format(
                filename=safe_filename(filename),
                s=self,
            )
        else:
            filename = self.default_filename

        if filename_prefix:
            filename = '{prefix}{filename}'.format(
                prefix=safe_filename(filename_prefix),
                filename=filename,
            )

        target_path = os.path.join(output_path, filename)
        bytes_remaining = self.filesize
        logger.debug(
            'downloading (%s total bytes) file to %s',
            self.filesize,
            target_path,
        )

        with open(target_path, 'wb') as fh:
            for chunk in request.get(self.url, streaming=True):
                # Count down what is left, then report to on_progress.
                bytes_remaining -= len(chunk)
                self.on_progress(chunk, fh, bytes_remaining)
        # At this point the with-block has already closed the handle.
        self.on_complete(fh)
        return target_path
예제 #5
0
    def title(self):
        """Return the playlist title (name).

        The title is read from the ``<title>`` element of the playlist
        page, with the ``- YouTube`` text and surrounding whitespace removed.

        :returns:
            The title, or ``None`` when anything goes wrong while fetching
            or parsing the page (the error is logged at debug level).
        """
        open_tag = "<title>"
        end_tag = "</title>"
        try:
            page = request.get(self.construct_playlist_url())
            pattern = re.compile(open_tag + "(.+?)" + end_tag)
            # group() is the whole match, tags included; they are stripped
            # out below along with the site suffix.
            text = pattern.search(page).group()
            for fragment in (open_tag, end_tag, "- YouTube"):
                text = text.replace(fragment, "")
            return text.strip()
        except Exception as e:
            logger.debug(e)
            return None
예제 #6
0
    def stream_to_buffer(self):
        """Download the media stream into an in-memory buffer.

        Each chunk of the stream is handed to ``self.on_progress`` together
        with the buffer and the number of bytes still expected;
        ``self.on_complete`` receives the buffer once all chunks were seen.

        :rtype: io.BytesIO buffer
        """
        target = io.BytesIO()
        remaining = self.filesize
        logger.debug(
            'downloading (%s total bytes) file to BytesIO buffer',
            self.filesize,
        )

        chunks = request.get(self.url, streaming=True)
        for chunk in chunks:
            # Count down what is left, then report to on_progress.
            remaining -= len(chunk)
            self.on_progress(chunk, target, remaining)

        self.on_complete(target)
        return target
예제 #7
0
 def xml_captions(self):
     """Download the caption track located at ``self.url``.

     :returns:
         Whatever ``request.get`` returns for that url (presumably the
         xml caption document as text -- confirm against ``request``).
     """
     captions_url = self.url
     return request.get(captions_url)
예제 #8
0
    def init(self):
        """Descramble the stream data and build Stream instances.

        ``self.vid_info`` is parsed from its query-string form into a dict,
        and ``self.player_config_args`` is taken either from that dict (age
        restricted videos) or from the player config found in the watch
        page. The ``mixins`` helpers are called for their effect on these
        dictionaries; their return values are not used, so the transforms
        are expected to happen in place.

        When the player config has no ``title`` entry, the title is
        recovered from the ``<title>`` element of the watch page, using
        BeautifulSoup when it can be imported and plain string searching
        otherwise.

        :rtype: None

        """
        logger.info('init started')

        self.vid_info = dict(parse_qsl(self.vid_info))
        if self.age_restricted:
            self.player_config_args = self.vid_info
        else:
            self.player_config_args = extract.get_ytplayer_config(
                self.watch_html,
            )['args']

            if 'title' not in self.player_config_args:
                # Only the import is guarded: ImportError also covers
                # ModuleNotFoundError and exists on every Python version,
                # while parse errors still surface to the caller.
                try:
                    from bs4 import BeautifulSoup
                except ImportError:
                    # since this parsing is actually pretty simple, we may
                    # just parse it using index()
                    lowered_html = self.watch_html.lower()
                    i_start = lowered_html.index('<title>') + len('<title>')
                    i_end = lowered_html.index('</title>')
                    title = self.watch_html[i_start:i_end].strip()
                else:
                    soup = BeautifulSoup(self.watch_html, 'lxml')
                    title = soup.title.get_text().strip()
                # remove the ' - youtube' part that is added to the browser
                # tab's title
                index = title.lower().rfind(' - youtube')
                title = title[:index] if index > 0 else title
                self.player_config_args['title'] = title

        self.vid_descr = extract.get_vid_descr(self.watch_html)
        # https://github.com/nficano/pytube/issues/165
        stream_maps = ['url_encoded_fmt_stream_map']
        if 'adaptive_fmts' in self.player_config_args:
            stream_maps.append('adaptive_fmts')

        # unscramble the progressive and adaptive stream manifests.
        for fmt in stream_maps:
            if not self.age_restricted and fmt in self.vid_info:
                mixins.apply_descrambler(self.vid_info, fmt)
            mixins.apply_descrambler(self.player_config_args, fmt)

            try:
                mixins.apply_signature(self.player_config_args, fmt, self.js)
            except TypeError:
                # Retry once with the player javascript referenced by the
                # embed page instead of the one used so far.
                self.js_url = extract.js_url(
                    self.embed_html,
                    self.age_restricted,
                )
                self.js = request.get(self.js_url)
                mixins.apply_signature(self.player_config_args, fmt, self.js)

            # build instances of :class:`Stream <Stream>`
            self.initialize_stream_objects(fmt)

        # load the player_response object (contains subtitle information)
        apply_mixin(self.player_config_args, 'player_response', json.loads)

        self.initialize_caption_objects()
        logger.info('init finished successfully')