def add_pdf_if_exist(metadata, pres_div):
    """Store the slide-deck PDF URL in metadata['pdf'] when one is advertised.

    The markup differs between authenticated and anonymous sessions:
    authenticated pages expose a download form, anonymous ones a plain link.
    """
    pdf_form = pres_div.find('form', id="pdfForm")
    if pdf_form:
        encoded_name = urllib.parse.quote(pdf_form.input['value'], safe='')
        metadata['pdf'] = client.get_url('/pdfdownload.action?filename=') + encoded_name
        return
    slides_link = pres_div.find('a', class_='link-slides')
    if slides_link:
        metadata['pdf'] = client.get_url(slides_link['href'])
def add_mp3_if_exist(metadata, bc3):
    """Record the MP3 download URL under metadata['mp3'] if the page offers one.

    Authenticated pages expose a download form; anonymous pages only show a
    direct link, so both markups are probed.
    """
    mp3_form = bc3.find('form', id="mp3Form")
    if mp3_form:
        encoded_name = urllib.parse.quote(mp3_form.input['value'], safe='')
        metadata['mp3'] = client.get_url('/mp3download.action?filename=') + encoded_name
    else:
        mp3_link = bc3.find('a', class_='link-mp3')
        if mp3_link:
            metadata['mp3'] = client.get_url(mp3_link['href'])
def add_mp3_if_exist(metadata, bc3):
    """Add the MP3 download URL to metadata['mp3'] when the page has one.

    Two markups exist depending on authentication: a form when logged in,
    a bare link otherwise; try the form first.
    """
    download_form = bc3.find('form', id="mp3Form")
    if download_form:
        quoted = urllib.parse.quote(download_form.input['value'], safe='')
        metadata['mp3'] = client.get_url('/mp3download.action?filename=') + quoted
        return
    anchor = bc3.find('a', class_='link-mp3')
    if anchor:
        metadata['mp3'] = client.get_url(anchor['href'])
def add_pdf_if_exist(metadata, pres_div):
    """Add the slide PDF URL to metadata['pdf'] when the page has one.

    Markup depends on authentication state: a pdfForm when logged in,
    a plain link-slides anchor otherwise.
    """
    download_form = pres_div.find('form', id="pdfForm")
    if download_form:
        quoted = urllib.parse.quote(download_form.input['value'], safe='')
        metadata['pdf'] = client.get_url('/pdfdownload.action?filename=') + quoted
    else:
        anchor = pres_div.find('a', class_='link-slides')
        if anchor:
            metadata['pdf'] = client.get_url(anchor['href'])
def test_presentation_java_gc_azul(self):
    """End-to-end metadata check against the "Java-GC-Azul-C4" presentation."""
    p = scrap.Presentation(self.iq, "Java-GC-Azul-C4")
    self.assertValidPresentationMetadata(p.metadata)
    self.assertEqual(p.metadata['title'], "Understanding Java Garbage Collection and What You Can Do about It")
    self.assertEqual(p.metadata['date'], datetime.datetime(2012, 10, 17))
    self.assertEqual(p.metadata['auth'], "Gil Tene")
    #self.assertEqual(p.metadata['duration'], 3469)
    self.assertEqual(p.metadata['summary'], "Gil Tene explains how a garbage collector works, covering the fundamentals, mechanism, terminology and metrics. He classifies several GCs, and introduces Azul C4.")
    self.assertEqual(p.metadata['bio'], "Gil Tene is CTO and co-founder of Azul Systems. He has been involved with virtual machine technologies for the past 20 years and has been building Java technology-based products since 1995. Gil pioneered Azul's Continuously Concurrent Compacting Collector (C4), Java Virtualization, Elastic Memory, and various managed runtime and systems stack technologies.")
    self.assertEqual(p.metadata['about'], 'Software is changing the world; QCon aims to empower software development by facilitating the spread of knowledge and innovation in the enterprise software development community; to achieve this, QCon is organized as a practitioner-driven conference designed for people influencing innovation in their teams: team leads, architects, project managers, engineering directors.')
    self.assertEqual(p.metadata['timecodes'], [3, 15, 73, 143, 227, 259, 343, 349, 540, 629, 752, 755, 822, 913, 1043, 1210, 1290, 1360, 1386, 1462, 1511, 1633, 1765, 1892, 1975, 2009, 2057, 2111, 2117, 2192, 2269, 2328, 2348, 2468, 2558, 2655, 2666, 2670, 2684, 2758, 2802, 2820, 2827, 2838, 2862, 2913, 2968, 3015, 3056, 3076, 3113, 3115, 3135, 3183, 3187, 3247, 3254, 3281, 3303, 3328, 3344, 3360, 3367, 3376, 3411, 3426, 3469])
    # Slide numbering has gaps: 49, 51, 53, 54 and 65 are absent on the site.
    # Fix: list(range(50, 51)) / list(range(52, 53)) were opaque spellings of
    # the single-element lists [50] and [52].
    slide_numbers = list(range(1, 49)) + [50] + [52] + list(range(55, 65)) + list(range(66, 72))
    self.assertEqual(p.metadata['slides'],
                     [client.get_url("/resource/presentations/Java-GC-Azul-C4/en/slides/%s.swf" % s)
                      for s in slide_numbers])
    self.assertEqual(p.metadata['video_url'], "rtmpe://video.infoq.com/cfx/st/")
    self.assertEqual(p.metadata['video_path'], "mp4:presentations/12-jun-everythingyoueverwanted.mp4")
    self.assertEqual(p.metadata['pdf'], "http://www.infoq.com/pdfdownload.action?filename=presentations%2FQConNY2012-GilTene-EverythingyoueverwantedtoknowaboutJavaCollectionbutweretooafraidtoask.pdf")
    self.assertEqual(p.metadata['mp3'], "http://www.infoq.com/mp3download.action?filename=presentations%2Finfoq-12-jun-everythingyoueverwanted.mp3")
def test_presentation_java_gc_azul(self):
    """End-to-end metadata check against the "Java-GC-Azul-C4" presentation (py2)."""
    p = presentation.Presentation(self.iq, "Java-GC-Azul-C4")
    self.assertValidPresentationMetadata(p.metadata)
    self.assertEqual(p.metadata['title'], "Understanding Java Garbage Collection and What You Can Do about It")
    self.assertEqual(p.metadata['date'], datetime.datetime(2012, 10, 17))
    self.assertEqual(p.metadata['auth'], "Gil Tene")
    self.assertEqual(p.metadata['duration'], 3469)
    self.assertEqual(p.metadata['sections'], ['Architecture & Design', 'Development'])
    # Topic order is not guaranteed by the scraper, so compare as multisets.
    self.assertItemsEqual(p.metadata['topics'], ['Azul Zing', 'Azul', 'JVM', 'Virtual Machines', 'Runtimes', 'Java', 'QCon New York 2012', 'GarbageCollection', 'QCon'])
    # Fix: summary is a single string; assertItemsEqual would compare its
    # characters as an unordered multiset and accept any anagram.
    self.assertEqual(p.metadata['summary'], "Gil Tene explains how a garbage collector works, covering the fundamentals, mechanism, terminology and metrics. He classifies several GCs, and introduces Azul C4.")
    self.assertEqual(p.metadata['bio'], "Gil Tene is CTO and co-founder of Azul Systems. He has been involved with virtual machine technologies for the past 20 years and has been building Java technology-based products since 1995. Gil pioneered Azul's Continuously Concurrent Compacting Collector (C4), Java Virtualization, Elastic Memory, and various managed runtime and systems stack technologies.")
    self.assertEqual(p.metadata['about'], 'Software is changing the world; QCon aims to empower software development by facilitating the spread of knowledge and innovation in the enterprise software development community; to achieve this, QCon is organized as a practitioner-driven conference designed for people influencing innovation in their teams: team leads, architects, project managers, engineering directors.')
    self.assertEqual(p.metadata['timecodes'], [3, 15, 73, 143, 227, 259, 343, 349, 540, 629, 752, 755, 822, 913, 1043, 1210, 1290, 1360, 1386, 1462, 1511, 1633, 1765, 1892, 1975, 2009, 2057, 2111, 2117, 2192, 2269, 2328, 2348, 2468, 2558, 2655, 2666, 2670, 2684, 2758, 2802, 2820, 2827, 2838, 2862, 2913, 2968, 3015, 3056, 3076, 3113, 3115, 3135, 3183, 3187, 3247, 3254, 3281, 3303, 3328, 3344, 3360, 3367, 3376, 3411, 3426, 3469])
    # Slide numbering has gaps (49, 51, 53, 54, 65 are absent on the site).
    self.assertEqual(p.metadata['slides'],
                     [client.get_url("/resource/presentations/Java-GC-Azul-C4/en/slides/%s.swf" % s)
                      for s in range(1, 49) + range(50, 51) + range(52, 53) + range(55, 65) + range(66, 72)])
    self.assertEqual(p.metadata['video_url'], "rtmpe://video.infoq.com/cfx/st/")
    self.assertEqual(p.metadata['video_path'], "mp4:presentations/12-jun-everythingyoueverwanted.mp4")
    self.assertEqual(p.metadata['pdf'], "http://www.infoq.com/pdfdownload.action?filename=presentations%2FQConNY2012-GilTene-EverythingyoueverwantedtoknowaboutJavaCollectionbutweretooafraidtoask.pdf")
    self.assertEqual(p.metadata['mp3'], "http://www.infoq.com/mp3download.action?filename=presentations%2Finfoq-12-jun-everythingyoueverwanted.mp3")
def soup(self):
    """Download the presentation page and memoize its parsed soup.

    Returns the cached soup on subsequent accesses (EAFP on ``self._soup``).
    """
    try:
        return self._soup
    except AttributeError:
        url = client.get_url("/presentations/%s" % self.index)
        content = self.client.fetch_no_cache(url).decode('utf-8')
        # Fix: name the parser explicitly — without it bs4 picks whichever
        # backend happens to be installed, making the parse tree (and a
        # warning) environment-dependent. Matches the other soup builders.
        self._soup = bs4.BeautifulSoup(content, "html.parser")
        return self._soup
def soup(self):
    """Fetch the presentation page once and cache its parsed soup."""
    if not hasattr(self, "_soup"):
        page_url = client.get_url("/presentations/%s" % self.index)
        markup = self.client.fetch_no_cache(page_url).decode('utf-8')
        self._soup = bs4.BeautifulSoup(markup, "html.parser")
    return self._soup
def soup(self):
    """Download the right-bar page for this index and memoize its soup.

    The page URL is derived from the first entry number on the page
    (index * ENTRIES_PER_PAGES).
    """
    try:
        return self._soup
    except AttributeError:
        url = client.get_url("/presentations/%s" % (self.index * _RightBarPage.ENTRIES_PER_PAGES))
        content = self.client.fetch_no_cache(url).decode('utf-8')
        # Fix: name the parser explicitly — without it bs4 picks whichever
        # backend is installed, so results vary across environments.
        self._soup = bs4.BeautifulSoup(content, "html.parser")
        return self._soup
def test_download(self):
    """Assets download into tmp_dir; a bogus URL raises DownloadError."""
    pres = test.get_latest_presentation(self.iq)
    self.assert_tmp_dir_is_empty()
    # First asset: the opening slide.
    self.iq.download(pres.metadata['slides'][0], self.tmp_dir)
    self.assert_tmp_dir_nb_files(1)
    # Second asset: the presentation page itself.
    self.iq.download(pres.metadata['url'], self.tmp_dir)
    self.assert_tmp_dir_nb_files(2)
    # A missing resource must raise and leave the directory untouched.
    with self.assertRaises(client.DownloadError):
        self.iq.download(client.get_url("/IDONOTEXIST"), self.tmp_dir)
    self.assert_tmp_dir_nb_files(2)
def test_download(self):
    """Downloading two real assets succeeds; a fake URL fails cleanly."""
    latest = test.get_latest_presentation(self.iq)
    self.assert_tmp_dir_is_empty()
    for expected_count, asset_url in (
            (1, latest.metadata['slides'][0]),   # first slide
            (2, latest.metadata['url'])):        # presentation page
        self.iq.download(asset_url, self.tmp_dir)
        self.assert_tmp_dir_nb_files(expected_count)
    # Failure path: nothing new may be written on error.
    with self.assertRaises(client.DownloadError):
        self.iq.download(client.get_url("/IDONOTEXIST"), self.tmp_dir)
    self.assert_tmp_dir_nb_files(2)
def soup(self):
    """POST to the dynamic rightbar endpoint and memoize the parsed soup.

    Raises a plain Exception when the endpoint does not answer 200.
    """
    try:
        return self._soup
    except AttributeError:
        params = {
            "language": "en",
            "selectedTab": "PRESENTATION",
            "startIndex": self.index * _RightBarPage.RIGHT_BAR_ENTRIES_PER_PAGES,
        }
        # Do not use iq.fetch to avoid caching since the rightbar is a dynamic page
        url = client.get_url("/rightbar.action")
        with contextlib.closing(self.client.opener.open(url, urllib.urlencode(params))) as response:
            if response.getcode() != 200:
                raise Exception("Fetching rightbar index %s failed" % self.index)
            content = response.read().decode('utf-8')
        # Fix: name the parser explicitly for deterministic parsing across
        # environments, matching the other soup builders in this file.
        self._soup = bs4.BeautifulSoup(content, "html.parser")
        return self._soup
def metadata(self):
    """Parse the presentation page and return its metadata dictionary.

    Computed once and memoized in ``self._metadata``. Keys: url, title,
    date, auth, duration, timecodes, slides, video_url, video_path,
    sections, topics, summary, bio, about, plus mp3/pdf when the page
    advertises those downloads.
    """

    def get_title(bc3):
        # Title is the text of the link inside the page's main <h1>.
        return bc3.find('h1').find('a').get_text().strip()

    def get_date(bc3):
        # The date follows the <strong> label in the info block,
        # e.g. "Oct 17, 2012".
        txt = bc3.find('div', class_='info').find('strong').next_sibling.strip()
        mo = re.search("[\w]{2,8}\s+[0-9]{1,2}, [0-9]{4}", txt)
        return datetime.datetime.strptime(mo.group(0), "%b %d, %Y")

    def get_author(bc3):
        return bc3.find('a', class_='editorlink').get_text().strip()

    def get_duration(bc3):
        # Duration is displayed as "HH:MM:SS"; convert to seconds.
        txt = bc3.find('span').get_text().strip()
        mo = re.search("(\d{2}):(\d{2}):(\d{2})", txt)
        return int(mo.group(1)) * 60 * 60 + int(mo.group(2)) * 60 + int(mo.group(3))

    def get_timecodes(bc3):
        # Timecodes live in an inline script: var TIMES = new Array(1,2,...).
        # Returns None when no script matches.
        for script in bc3.find_all('script'):
            mo = re.search("var\s+TIMES\s?=\s?new\s+Array.?\((\d+(,\d+)+)\)", script.get_text())
            if mo:
                return [int(tc) for tc in mo.group(1).split(',')]

    def get_slides(bc3):
        # Slide paths live in an inline script: var slides = new Array('...').
        for script in bc3.find_all('script'):
            mo = re.search("var\s+slides\s?=\s?new\s+Array.?\(('.+')\)", script.get_text())
            if mo:
                return [client.get_url(slide.replace('\'', '')) for slide in mo.group(1).split(',')]

    def get_video(bc3):
        # The video path is base64-encoded in the jsclassref script variable.
        for script in bc3.find_all('script'):
            mo = re.search('var jsclassref=\'(.*)\';', script.get_text())
            if mo:
                b64 = mo.group(1)
                path = base64.b64decode(b64)
                # Older presentations use flv and the video path does not contain
                # the extension. Newer presentations use mp4 and include the
                # extension.
                if path.endswith(".mp4"):
                    return "mp4:%s" % path
                elif path.endswith(".flv"):
                    return "flv:%s" % path[:-4]
                else:
                    raise Exception("Unsupported video type: %s" % path)

    def add_pdf_if_exist(metadata, bc3):
        # The markup is not the same if authenticated or not: a download form
        # when logged in, a plain link otherwise.
        form = bc3.find('form', id="pdfForm")
        if form:
            metadata['pdf'] = client.get_url('/pdfdownload.action?filename=') + urllib.quote(form.input['value'], safe='')
        else:
            a = bc3.find('a', class_='link-slides')
            if a:
                metadata['pdf'] = client.get_url(a['href'])

    def add_mp3_if_exist(metadata, bc3):
        # The markup is not the same if authenticated or not (see add_pdf_if_exist).
        form = bc3.find('form', id="mp3Form")
        if form:
            metadata['mp3'] = client.get_url('/mp3download.action?filename=') + urllib.quote(form.input['value'], safe='')
        else:
            a = bc3.find('a', class_='link-mp3')
            if a:
                metadata['mp3'] = client.get_url(a['href'])

    def add_sections_and_topics(metadata, bc3):
        # Extracting these two is quite ugly since there is no clear separation
        # between sections, topics and advertisement. We need to iterate over
        # all children and maintain a state to know who is who.
        in_sections = True
        in_topics = False
        sections = []
        topics = []
        for child in bc3.find('dl', class_="tags2").children:
            if not isinstance(child, bs4.element.Tag):
                continue
            if child.name == 'dt' and "topics" in child['class']:
                if in_topics:
                    break
                in_sections = False
                in_topics = True
                continue
            if in_sections and child.name == 'dd':
                sections.append(child.a.get_text().strip())
            if in_topics and child.name == 'dd':
                topics.append(child.a.get_text().strip())
        metadata['sections'] = sections
        metadata['topics'] = topics

    def add_summary_bio_about(metadata, bc3):
        # The summary component is a flat sequence of text nodes separated by
        # <b> headers; each <b> closes the chunk of text accumulated so far.
        content = []
        txt = ""
        for child in bc3.find('div', id="summaryComponent"):
            if isinstance(child, bs4.element.NavigableString):
                txt += unicode(child).strip()
            elif child.name == 'b':
                content.append(txt)
                txt = ""
                continue
            elif child.name == 'br':
                continue
        content.append(txt)
        # content[0] is the text before the first header; the following
        # chunks are summary, bio and about in page order.
        metadata['summary'] = content[1]
        metadata['bio'] = content[2]
        metadata['about'] = content[3]

    if not hasattr(self, "_metadata"):
        box_content_3 = self.soup.find('div', class_='box-content-3')
        metadata = {
            'url': client.get_url("/presentations/" + self.id),
            'title': get_title(box_content_3),
            'date' : get_date(box_content_3),
            'auth' : get_author(box_content_3),
            'duration': get_duration(box_content_3),
            'timecodes': get_timecodes(box_content_3),
            'slides': get_slides(box_content_3),
            # Streaming endpoint is fixed; only the path varies per talk.
            'video_url': "rtmpe://video.infoq.com/cfx/st/",
            'video_path': get_video(box_content_3),
        }
        add_sections_and_topics(metadata, box_content_3)
        add_summary_bio_about(metadata, box_content_3)
        add_mp3_if_exist(metadata, box_content_3)
        add_pdf_if_exist(metadata, box_content_3)
        self._metadata = metadata
    return self._metadata
def get_url(div):
    """Return the absolute URL for the entry represented by *div*."""
    relative_path = get_path(div)
    return client.get_url(relative_path)
def get_slides(bc3):
    """Return absolute slide URLs parsed from the inline 'slides' JS array."""
    pattern = "var\s+slides\s?=\s?new\s+Array.?\(('.+')\)"
    for script in bc3.find_all('script'):
        mo = re.search(pattern, script.get_text())
        if not mo:
            continue
        raw_entries = mo.group(1).split(',')
        return [client.get_url(entry.replace('\'', '')) for entry in raw_entries]
def metadata(self):
    """Parse the presentation page and return its metadata dictionary.

    Computed once and memoized in ``self._metadata``. Keys: url, title,
    date, auth, timecodes, slides, video_url, video_path, bio, summary,
    about, plus mp3/pdf when those downloads are advertised.
    """

    def get_title(pres_div):
        return pres_div.find('h1', class_="general").div.get_text().strip()

    def get_date(pres_div):
        # Fix: the original bound the parsed text to a local named `str`,
        # shadowing the builtin; renamed to `txt` (behavior unchanged).
        txt = pres_div.find('span', class_='author_general').contents[2]
        txt = txt.replace('\n', ' ')
        txt = txt.replace(six.u('\xa0'), ' ')
        # Text reads "... on <Mon DD, YYYY>"; keep what follows "on ".
        txt = txt.split("on ")[-1]
        txt = txt.strip()
        return datetime.datetime.strptime(txt, "%b %d, %Y")

    def get_author(pres_div):
        return pres_div.find(
            'span', class_='author_general').contents[1].get_text().strip()

    def get_timecodes(pres_div):
        # Timecodes live in an inline script: TIMES = new Array(1,2,...).
        # Returns None when no script matches.
        for script in pres_div.find_all('script'):
            mo = re.search("TIMES\s?=\s?new\s+Array.?\((\d+(,\d+)+)\)",
                           script.get_text())
            if mo:
                return [int(tc) for tc in mo.group(1).split(',')]

    def get_slides(pres_div):
        # Slide paths live in an inline script: var slides = new Array('...').
        for script in pres_div.find_all('script'):
            mo = re.search("var\s+slides\s?=\s?new\s+Array.?\(('.+')\)",
                           script.get_text())
            if mo:
                return [
                    slide.replace('\'', '')
                    for slide in mo.group(1).split(',')
                ]

    def get_video(pres_div):
        # The video path is base64-encoded in the jsclassref script variable.
        for script in pres_div.find_all('script'):
            mo = re.search('var jsclassref = \'(.*)\';', script.get_text())
            if mo:
                b64 = mo.group(1)
                path = base64.b64decode(b64).decode('utf-8')
                # Older presentations use flv and the video path does not
                # contain the extension. Newer presentations use mp4 and
                # include the extension.
                if path.endswith(".mp4"):
                    return "mp4:%s" % path
                elif path.endswith(".flv"):
                    return "flv:%s" % path[:-4]
                else:
                    raise Exception("Unsupported video type: %s" % path)

    def get_bio(div):
        return div.find('p', id="biotext").get_text(strip=True)

    def get_summary(div):
        # Drop the leading header chunk produced by the "|" separator.
        return "".join(
            div.find('p', id="summary").get_text("|", strip=True).split("|")[1:])

    def get_about(div):
        return div.find('p', id="conference").get_text(strip=True)

    def add_pdf_if_exist(metadata, pres_div):
        # The markup is not the same if authenticated or not
        form = pres_div.find('form', id="pdfForm")
        if form:
            metadata['pdf'] = client.get_url(
                '/pdfdownload.action?filename=') + urllib.parse.quote(
                    form.input['value'], safe='')
        else:
            a = pres_div.find('a', class_='link-slides')
            if a:
                metadata['pdf'] = client.get_url(a['href'])

    def add_mp3_if_exist(metadata, bc3):
        # The markup is not the same if authenticated or not
        form = bc3.find('form', id="mp3Form")
        if form:
            metadata['mp3'] = client.get_url(
                '/mp3download.action?filename=') + urllib.parse.quote(
                    form.input['value'], safe='')
        else:
            a = bc3.find('a', class_='link-mp3')
            if a:
                metadata['mp3'] = client.get_url(a['href'])

    if not hasattr(self, "_metadata"):
        pres_div = self.soup.find('div', class_='presentation_full')
        metadata = {
            'url': client.get_url("/presentations/" + self.id),
            'title': get_title(pres_div),
            'date': get_date(pres_div),
            'auth': get_author(pres_div),
            'timecodes': get_timecodes(self.soup),
            'slides': get_slides(self.soup),
            # Streaming endpoint is fixed; only the path varies per talk.
            'video_url': six.u("rtmpe://video.infoq.com/cfx/st/"),
            'video_path': get_video(self.soup),
            'bio': get_bio(pres_div),
            'summary': get_summary(pres_div),
            'about': get_about(pres_div),
        }
        add_mp3_if_exist(metadata, pres_div)
        add_pdf_if_exist(metadata, pres_div)
        self._metadata = metadata
    return self._metadata
def _fetch(self):
    """Download the page and create the soup"""
    page_url = client.get_url("/presentations/" + self.id)
    raw_bytes = self.client.fetch_no_cache(page_url)
    return bs4.BeautifulSoup(raw_bytes.decode('utf-8'), "html.parser")
def test_fetch_no_cache_error(self):
    """fetch_no_cache must raise DownloadError for a missing resource."""
    bad_url = client.get_url("/IDONOTEXIST")
    with self.assertRaises(client.DownloadError):
        self.iq.fetch_no_cache(bad_url)
def test_fetch_no_cache_error(self):
    """A non-existent path makes fetch_no_cache raise client.DownloadError."""
    with self.assertRaises(client.DownloadError):
        missing = client.get_url("/IDONOTEXIST")
        self.iq.fetch_no_cache(missing)
def get_url(div):
    """Absolute URL taken from the item's title link."""
    title_anchor = div.find('h2', class_='itemtitle').a
    return client.get_url(title_anchor['href'])
def get_url(div):
    """Absolute URL of the entry, read from its <h2 class="itemtitle"> link."""
    heading = div.find('h2', class_='itemtitle')
    return client.get_url(heading.a['href'])
def metadata(self):
    """Parse the presentation page and return its metadata dictionary.

    Computed once and memoized in ``self._metadata``. Keys: url, title,
    date, auth, timecodes, demo_timings, slides, video_url, video_path,
    bio, summary, about, plus mp3/pdf when those downloads are advertised.
    """

    def get_title(pres_div):
        return pres_div.find('h1', class_="general").div.get_text().strip()

    def get_date(pres_div):
        # The span's text contains "... on <Mon DD, YYYY>"; pull the date
        # out with a regex and fail loudly if the markup changed.
        strings = ''.join(pres_div.find('span', class_='author_general').strings)
        match = re.search('on[\n ]+(.*\d{4})', strings)
        if match:
            return datetime.datetime.strptime(match.group(1), "%b %d, %Y")
        else:
            raise Exception("Failed to extract date (markup changed?)")

    def get_author(pres_div):
        return pres_div.find('span', class_='authors-list').find('a').get_text().strip()

    def get_timecodes(pres_div):
        # Timecodes live in an inline script: TIMES = new Array(1,2,...).
        # Returns None when no script matches.
        for script in pres_div.find_all('script'):
            mo = re.search("TIMES\s?=\s?new\s+Array.?\((\d+(,\d+)+)\)", script.get_text())
            if mo:
                return [int(tc) for tc in mo.group(1).split(',')]

    def get_slides(pres_div):
        # Slide paths live in an inline script: var slides = new Array('...').
        for script in pres_div.find_all('script'):
            mo = re.search("var\s+slides\s?=\s?new\s+Array.?\(('.+')\)", script.get_text())
            if mo:
                return [slide.replace('\'', '') for slide in mo.group(1).split(',')]

    def get_video(pres_div):
        # The video path is base64-encoded in the jsclassref script variable.
        for script in pres_div.find_all('script'):
            mo = re.search('var jsclassref = \'(.*)\';', script.get_text())
            if mo:
                b64 = mo.group(1)
                path = base64.b64decode(b64).decode('utf-8')
                # Older presentations use flv and the video path does not contain
                # the extension. Newer presentations use mp4 and include the
                # extension.
                if path.endswith(".mp4"):
                    return "mp4:%s" % path
                elif path.endswith(".flv"):
                    return "flv:%s" % path[:-4]
                else:
                    raise Exception("Unsupported video type: %s" % path)

    def get_bio(div):
        return div.find('p', id="biotext").get_text(strip=True)

    def get_summary(div):
        # Drop the leading header chunk produced by the "|" separator.
        return "".join(div.find('p', id="summary").get_text("|", strip=True).split("|")[1:])

    def get_about(div):
        return div.find('p', id="conference").get_text(strip=True)

    def get_demo_timings(pres_div):
        # demoTimings = '1,2,...' in an inline script; empty list if absent.
        for script in pres_div.find_all('script'):
            timings = re.search("demoTimings\s+=\s+'([^']+)", script.get_text())
            if timings:
                return [int(t) for t in timings.group(1).split(',')]
        return []

    def add_pdf_if_exist(metadata, pres_div):
        # The markup is not the same if authenticated or not: a download form
        # when logged in, a plain link otherwise.
        form = pres_div.find('form', id="pdfForm")
        if form:
            metadata['pdf'] = client.get_url('/pdfdownload.action?filename=') + urllib.parse.quote(form.input['value'], safe='')
        else:
            a = pres_div.find('a', class_='link-slides')
            if a:
                metadata['pdf'] = client.get_url(a['href'])

    def add_mp3_if_exist(metadata, bc3):
        # The markup is not the same if authenticated or not (see add_pdf_if_exist).
        form = bc3.find('form', id="mp3Form")
        if form:
            metadata['mp3'] = client.get_url('/mp3download.action?filename=') + urllib.parse.quote(form.input['value'], safe='')
        else:
            a = bc3.find('a', class_='link-mp3')
            if a:
                metadata['mp3'] = client.get_url(a['href'])

    if not hasattr(self, "_metadata"):
        pres_div = self.soup.find('div', class_='presentation_full')
        metadata = {
            'url': client.get_url("/presentations/" + self.id),
            'title': get_title(pres_div),
            'date' : get_date(pres_div),
            'auth' : get_author(pres_div),
            'timecodes': get_timecodes(self.soup),
            'demo_timings': get_demo_timings(self.soup),
            'slides': get_slides(self.soup),
            # Streaming endpoint is fixed; only the path varies per talk.
            'video_url': six.u("rtmpe://video.infoq.com/cfx/st/"),
            'video_path': get_video(self.soup),
            'bio': get_bio(pres_div),
            'summary': get_summary(pres_div),
            'about': get_about(pres_div),
        }
        add_mp3_if_exist(metadata, pres_div)
        add_pdf_if_exist(metadata, pres_div)
        self._metadata = metadata
    return self._metadata
def _fetch(self):
    """Download the page and create the soup"""
    target = client.get_url("/presentations/" + self.id)
    html_text = self.client.fetch_no_cache(target).decode('utf-8')
    return bs4.BeautifulSoup(html_text, "html.parser")