def add_pdf_if_exist(metadata, pres_div):
    """Store the slide-deck PDF URL in metadata['pdf'] when one is advertised.

    The markup differs between authenticated and anonymous sessions:
    authenticated pages expose a download form, anonymous ones a plain link.
    """
    pdf_form = pres_div.find('form', id="pdfForm")
    if pdf_form:
        encoded_name = urllib.parse.quote(pdf_form.input['value'], safe='')
        metadata['pdf'] = client.get_url('/pdfdownload.action?filename=') + encoded_name
        return
    slides_link = pres_div.find('a', class_='link-slides')
    if slides_link:
        metadata['pdf'] = client.get_url(slides_link['href'])
def add_mp3_if_exist(metadata, bc3):
    """Record the MP3 download URL under metadata['mp3'] if the page offers one.

    Authenticated pages expose a download form; anonymous pages only show a
    direct link, so both markups are probed.
    """
    mp3_form = bc3.find('form', id="mp3Form")
    if mp3_form:
        encoded_name = urllib.parse.quote(mp3_form.input['value'], safe='')
        metadata['mp3'] = client.get_url('/mp3download.action?filename=') + encoded_name
    else:
        mp3_link = bc3.find('a', class_='link-mp3')
        if mp3_link:
            metadata['mp3'] = client.get_url(mp3_link['href'])
def add_mp3_if_exist(metadata, bc3):
    """Add the MP3 download URL to metadata['mp3'] when the page has one.

    Two markups exist depending on authentication: a form when logged in,
    a bare link otherwise; try the form first.
    """
    download_form = bc3.find('form', id="mp3Form")
    if download_form:
        quoted = urllib.parse.quote(download_form.input['value'], safe='')
        metadata['mp3'] = client.get_url('/mp3download.action?filename=') + quoted
        return
    anchor = bc3.find('a', class_='link-mp3')
    if anchor:
        metadata['mp3'] = client.get_url(anchor['href'])
def add_pdf_if_exist(metadata, pres_div):
    """Add the slide PDF URL to metadata['pdf'] when the page has one.

    Markup depends on authentication state: a pdfForm when logged in,
    a plain link-slides anchor otherwise.
    """
    download_form = pres_div.find('form', id="pdfForm")
    if download_form:
        quoted = urllib.parse.quote(download_form.input['value'], safe='')
        metadata['pdf'] = client.get_url('/pdfdownload.action?filename=') + quoted
    else:
        anchor = pres_div.find('a', class_='link-slides')
        if anchor:
            metadata['pdf'] = client.get_url(anchor['href'])
def test_presentation_java_gc_azul(self):
    """End-to-end metadata check against the "Java-GC-Azul-C4" presentation."""
    p = scrap.Presentation(self.iq, "Java-GC-Azul-C4")
    self.assertValidPresentationMetadata(p.metadata)
    self.assertEqual(p.metadata['title'], "Understanding Java Garbage Collection and What You Can Do about It")
    self.assertEqual(p.metadata['date'], datetime.datetime(2012, 10, 17))
    self.assertEqual(p.metadata['auth'], "Gil Tene")
    #self.assertEqual(p.metadata['duration'], 3469)
    self.assertEqual(p.metadata['summary'], "Gil Tene explains how a garbage collector works, covering the fundamentals, mechanism, terminology and metrics. He classifies several GCs, and introduces Azul C4.")
    self.assertEqual(p.metadata['bio'], "Gil Tene is CTO and co-founder of Azul Systems. He has been involved with virtual machine technologies for the past 20 years and has been building Java technology-based products since 1995. Gil pioneered Azul's Continuously Concurrent Compacting Collector (C4), Java Virtualization, Elastic Memory, and various managed runtime and systems stack technologies.")
    self.assertEqual(p.metadata['about'], 'Software is changing the world; QCon aims to empower software development by facilitating the spread of knowledge and innovation in the enterprise software development community; to achieve this, QCon is organized as a practitioner-driven conference designed for people influencing innovation in their teams: team leads, architects, project managers, engineering directors.')
    self.assertEqual(p.metadata['timecodes'], [3, 15, 73, 143, 227, 259, 343, 349, 540, 629, 752, 755, 822, 913, 1043, 1210, 1290, 1360, 1386, 1462, 1511, 1633, 1765, 1892, 1975, 2009, 2057, 2111, 2117, 2192, 2269, 2328, 2348, 2468, 2558, 2655, 2666, 2670, 2684, 2758, 2802, 2820, 2827, 2838, 2862, 2913, 2968, 3015, 3056, 3076, 3113, 3115, 3135, 3183, 3187, 3247, 3254, 3281, 3303, 3328, 3344, 3360, 3367, 3376, 3411, 3426, 3469])
    # Slide numbering has gaps: 49, 51, 53, 54 and 65 are absent on the site.
    # Fix: list(range(50, 51)) / list(range(52, 53)) were opaque spellings of
    # the single-element lists [50] and [52].
    slide_numbers = list(range(1, 49)) + [50] + [52] + list(range(55, 65)) + list(range(66, 72))
    self.assertEqual(p.metadata['slides'],
                     [client.get_url("/resource/presentations/Java-GC-Azul-C4/en/slides/%s.swf" % s)
                      for s in slide_numbers])
    self.assertEqual(p.metadata['video_url'], "rtmpe://video.infoq.com/cfx/st/")
    self.assertEqual(p.metadata['video_path'], "mp4:presentations/12-jun-everythingyoueverwanted.mp4")
    self.assertEqual(p.metadata['pdf'], "http://www.infoq.com/pdfdownload.action?filename=presentations%2FQConNY2012-GilTene-EverythingyoueverwantedtoknowaboutJavaCollectionbutweretooafraidtoask.pdf")
    self.assertEqual(p.metadata['mp3'], "http://www.infoq.com/mp3download.action?filename=presentations%2Finfoq-12-jun-everythingyoueverwanted.mp3")
def test_presentation_java_gc_azul(self):
    """End-to-end metadata check against the "Java-GC-Azul-C4" presentation (py2)."""
    p = presentation.Presentation(self.iq, "Java-GC-Azul-C4")
    self.assertValidPresentationMetadata(p.metadata)
    self.assertEqual(p.metadata['title'], "Understanding Java Garbage Collection and What You Can Do about It")
    self.assertEqual(p.metadata['date'], datetime.datetime(2012, 10, 17))
    self.assertEqual(p.metadata['auth'], "Gil Tene")
    self.assertEqual(p.metadata['duration'], 3469)
    self.assertEqual(p.metadata['sections'], ['Architecture & Design', 'Development'])
    # Topic order is not guaranteed by the scraper, so compare as multisets.
    self.assertItemsEqual(p.metadata['topics'], ['Azul Zing', 'Azul', 'JVM', 'Virtual Machines', 'Runtimes', 'Java', 'QCon New York 2012', 'GarbageCollection', 'QCon'])
    # Fix: summary is a single string; assertItemsEqual would compare its
    # characters as an unordered multiset and accept any anagram.
    self.assertEqual(p.metadata['summary'], "Gil Tene explains how a garbage collector works, covering the fundamentals, mechanism, terminology and metrics. He classifies several GCs, and introduces Azul C4.")
    self.assertEqual(p.metadata['bio'], "Gil Tene is CTO and co-founder of Azul Systems. He has been involved with virtual machine technologies for the past 20 years and has been building Java technology-based products since 1995. Gil pioneered Azul's Continuously Concurrent Compacting Collector (C4), Java Virtualization, Elastic Memory, and various managed runtime and systems stack technologies.")
    self.assertEqual(p.metadata['about'], 'Software is changing the world; QCon aims to empower software development by facilitating the spread of knowledge and innovation in the enterprise software development community; to achieve this, QCon is organized as a practitioner-driven conference designed for people influencing innovation in their teams: team leads, architects, project managers, engineering directors.')
    self.assertEqual(p.metadata['timecodes'], [3, 15, 73, 143, 227, 259, 343, 349, 540, 629, 752, 755, 822, 913, 1043, 1210, 1290, 1360, 1386, 1462, 1511, 1633, 1765, 1892, 1975, 2009, 2057, 2111, 2117, 2192, 2269, 2328, 2348, 2468, 2558, 2655, 2666, 2670, 2684, 2758, 2802, 2820, 2827, 2838, 2862, 2913, 2968, 3015, 3056, 3076, 3113, 3115, 3135, 3183, 3187, 3247, 3254, 3281, 3303, 3328, 3344, 3360, 3367, 3376, 3411, 3426, 3469])
    # Slide numbering has gaps (49, 51, 53, 54, 65 are absent on the site).
    self.assertEqual(p.metadata['slides'],
                     [client.get_url("/resource/presentations/Java-GC-Azul-C4/en/slides/%s.swf" % s)
                      for s in range(1, 49) + range(50, 51) + range(52, 53) + range(55, 65) + range(66, 72)])
    self.assertEqual(p.metadata['video_url'], "rtmpe://video.infoq.com/cfx/st/")
    self.assertEqual(p.metadata['video_path'], "mp4:presentations/12-jun-everythingyoueverwanted.mp4")
    self.assertEqual(p.metadata['pdf'], "http://www.infoq.com/pdfdownload.action?filename=presentations%2FQConNY2012-GilTene-EverythingyoueverwantedtoknowaboutJavaCollectionbutweretooafraidtoask.pdf")
    self.assertEqual(p.metadata['mp3'], "http://www.infoq.com/mp3download.action?filename=presentations%2Finfoq-12-jun-everythingyoueverwanted.mp3")
def soup(self):
    """Download the presentation page and memoize its parsed soup.

    Returns the cached soup on subsequent accesses (EAFP on ``self._soup``).
    """
    try:
        return self._soup
    except AttributeError:
        url = client.get_url("/presentations/%s" % self.index)
        content = self.client.fetch_no_cache(url).decode('utf-8')
        # Fix: name the parser explicitly — without it bs4 picks whichever
        # backend happens to be installed, making the parse tree (and a
        # warning) environment-dependent. Matches the other soup builders.
        self._soup = bs4.BeautifulSoup(content, "html.parser")
        return self._soup
def soup(self):
    """Fetch the presentation page once and cache its parsed soup."""
    if not hasattr(self, "_soup"):
        page_url = client.get_url("/presentations/%s" % self.index)
        markup = self.client.fetch_no_cache(page_url).decode('utf-8')
        self._soup = bs4.BeautifulSoup(markup, "html.parser")
    return self._soup
def soup(self):
    """Download the right-bar page for this index and memoize its soup.

    The page URL is derived from the first entry number on the page
    (index * ENTRIES_PER_PAGES).
    """
    try:
        return self._soup
    except AttributeError:
        url = client.get_url("/presentations/%s" % (self.index * _RightBarPage.ENTRIES_PER_PAGES))
        content = self.client.fetch_no_cache(url).decode('utf-8')
        # Fix: name the parser explicitly — without it bs4 picks whichever
        # backend is installed, so results vary across environments.
        self._soup = bs4.BeautifulSoup(content, "html.parser")
        return self._soup
def test_download(self):
    """Assets download into tmp_dir; a bogus URL raises DownloadError."""
    pres = test.get_latest_presentation(self.iq)
    self.assert_tmp_dir_is_empty()
    # First asset: the opening slide.
    self.iq.download(pres.metadata['slides'][0], self.tmp_dir)
    self.assert_tmp_dir_nb_files(1)
    # Second asset: the presentation page itself.
    self.iq.download(pres.metadata['url'], self.tmp_dir)
    self.assert_tmp_dir_nb_files(2)
    # A missing resource must raise and leave the directory untouched.
    with self.assertRaises(client.DownloadError):
        self.iq.download(client.get_url("/IDONOTEXIST"), self.tmp_dir)
    self.assert_tmp_dir_nb_files(2)
def test_download(self):
    """Downloading two real assets succeeds; a fake URL fails cleanly."""
    latest = test.get_latest_presentation(self.iq)
    self.assert_tmp_dir_is_empty()
    for expected_count, asset_url in (
            (1, latest.metadata['slides'][0]),   # first slide
            (2, latest.metadata['url'])):        # presentation page
        self.iq.download(asset_url, self.tmp_dir)
        self.assert_tmp_dir_nb_files(expected_count)
    # Failure path: nothing new may be written on error.
    with self.assertRaises(client.DownloadError):
        self.iq.download(client.get_url("/IDONOTEXIST"), self.tmp_dir)
    self.assert_tmp_dir_nb_files(2)
def soup(self):
    """POST to the dynamic rightbar endpoint and memoize the parsed soup.

    Raises a plain Exception when the endpoint does not answer 200.
    """
    try:
        return self._soup
    except AttributeError:
        params = {
            "language": "en",
            "selectedTab": "PRESENTATION",
            "startIndex": self.index * _RightBarPage.RIGHT_BAR_ENTRIES_PER_PAGES,
        }
        # Do not use iq.fetch to avoid caching since the rightbar is a dynamic page
        url = client.get_url("/rightbar.action")
        with contextlib.closing(self.client.opener.open(url, urllib.urlencode(params))) as response:
            if response.getcode() != 200:
                raise Exception("Fetching rightbar index %s failed" % self.index)
            content = response.read().decode('utf-8')
        # Fix: name the parser explicitly for deterministic parsing across
        # environments, matching the other soup builders in this file.
        self._soup = bs4.BeautifulSoup(content, "html.parser")
        return self._soup
def metadata(self):
    """Parse the presentation page and return its metadata dictionary.

    Computed once and memoized in ``self._metadata``. Keys: url, title,
    date, auth, duration, timecodes, slides, video_url, video_path,
    sections, topics, summary, bio, about, plus mp3/pdf when the page
    advertises those downloads.
    """

    def get_title(bc3):
        # Title is the text of the link inside the page's main <h1>.
        return bc3.find('h1').find('a').get_text().strip()

    def get_date(bc3):
        # The date follows the <strong> label in the info block,
        # e.g. "Oct 17, 2012".
        txt = bc3.find('div', class_='info').find('strong').next_sibling.strip()
        mo = re.search("[\w]{2,8}\s+[0-9]{1,2}, [0-9]{4}", txt)
        return datetime.datetime.strptime(mo.group(0), "%b %d, %Y")

    def get_author(bc3):
        return bc3.find('a', class_='editorlink').get_text().strip()

    def get_duration(bc3):
        # Duration is displayed as "HH:MM:SS"; convert to seconds.
        txt = bc3.find('span').get_text().strip()
        mo = re.search("(\d{2}):(\d{2}):(\d{2})", txt)
        return int(mo.group(1)) * 60 * 60 + int(mo.group(2)) * 60 + int(mo.group(3))

    def get_timecodes(bc3):
        # Timecodes live in an inline script: var TIMES = new Array(1,2,...).
        # Returns None when no script matches.
        for script in bc3.find_all('script'):
            mo = re.search("var\s+TIMES\s?=\s?new\s+Array.?\((\d+(,\d+)+)\)", script.get_text())
            if mo:
                return [int(tc) for tc in mo.group(1).split(',')]

    def get_slides(bc3):
        # Slide paths live in an inline script: var slides = new Array('...').
        for script in bc3.find_all('script'):
            mo = re.search("var\s+slides\s?=\s?new\s+Array.?\(('.+')\)", script.get_text())
            if mo:
                return [client.get_url(slide.replace('\'', '')) for slide in mo.group(1).split(',')]

    def get_video(bc3):
        # The video path is base64-encoded in the jsclassref script variable.
        for script in bc3.find_all('script'):
            mo = re.search('var jsclassref=\'(.*)\';', script.get_text())
            if mo:
                b64 = mo.group(1)
                path = base64.b64decode(b64)
                # Older presentations use flv and the video path does not contain
                # the extension. Newer presentations use mp4 and include the
                # extension.
                if path.endswith(".mp4"):
                    return "mp4:%s" % path
                elif path.endswith(".flv"):
                    return "flv:%s" % path[:-4]
                else:
                    raise Exception("Unsupported video type: %s" % path)

    def add_pdf_if_exist(metadata, bc3):
        # The markup is not the same if authenticated or not: a download form
        # when logged in, a plain link otherwise.
        form = bc3.find('form', id="pdfForm")
        if form:
            metadata['pdf'] = client.get_url('/pdfdownload.action?filename=') + urllib.quote(form.input['value'], safe='')
        else:
            a = bc3.find('a', class_='link-slides')
            if a:
                metadata['pdf'] = client.get_url(a['href'])

    def add_mp3_if_exist(metadata, bc3):
        # The markup is not the same if authenticated or not (see add_pdf_if_exist).
        form = bc3.find('form', id="mp3Form")
        if form:
            metadata['mp3'] = client.get_url('/mp3download.action?filename=') + urllib.quote(form.input['value'], safe='')
        else:
            a = bc3.find('a', class_='link-mp3')
            if a:
                metadata['mp3'] = client.get_url(a['href'])

    def add_sections_and_topics(metadata, bc3):
        # Extracting these two is quite ugly since there is no clear separation
        # between sections, topics and advertisement. We need to iterate over
        # all children and maintain a state to know who is who.
        in_sections = True
        in_topics = False
        sections = []
        topics = []
        for child in bc3.find('dl', class_="tags2").children:
            if not isinstance(child, bs4.element.Tag):
                continue
            if child.name == 'dt' and "topics" in child['class']:
                if in_topics:
                    break
                in_sections = False
                in_topics = True
                continue
            if in_sections and child.name == 'dd':
                sections.append(child.a.get_text().strip())
            if in_topics and child.name == 'dd':
                topics.append(child.a.get_text().strip())
        metadata['sections'] = sections
        metadata['topics'] = topics

    def add_summary_bio_about(metadata, bc3):
        # The summary component is a flat sequence of text nodes separated by
        # <b> headers; each <b> closes the chunk of text accumulated so far.
        content = []
        txt = ""
        for child in bc3.find('div', id="summaryComponent"):
            if isinstance(child, bs4.element.NavigableString):
                txt += unicode(child).strip()
            elif child.name == 'b':
                content.append(txt)
                txt = ""
                continue
            elif child.name == 'br':
                continue
        content.append(txt)
        # content[0] is the text before the first header; the following
        # chunks are summary, bio and about in page order.
        metadata['summary'] = content[1]
        metadata['bio'] = content[2]
        metadata['about'] = content[3]

    if not hasattr(self, "_metadata"):
        box_content_3 = self.soup.find('div', class_='box-content-3')
        metadata = {
            'url': client.get_url("/presentations/" + self.id),
            'title': get_title(box_content_3),
            'date' : get_date(box_content_3),
            'auth' : get_author(box_content_3),
            'duration': get_duration(box_content_3),
            'timecodes': get_timecodes(box_content_3),
            'slides': get_slides(box_content_3),
            # Streaming endpoint is fixed; only the path varies per talk.
            'video_url': "rtmpe://video.infoq.com/cfx/st/",
            'video_path': get_video(box_content_3),
        }
        add_sections_and_topics(metadata, box_content_3)
        add_summary_bio_about(metadata, box_content_3)
        add_mp3_if_exist(metadata, box_content_3)
        add_pdf_if_exist(metadata, box_content_3)
        self._metadata = metadata
    return self._metadata
def get_url(div):
    """Return the absolute URL for the entry represented by *div*."""
    relative_path = get_path(div)
    return client.get_url(relative_path)
def get_slides(bc3):
    """Return absolute slide URLs parsed from the inline 'slides' JS array."""
    pattern = "var\s+slides\s?=\s?new\s+Array.?\(('.+')\)"
    for script in bc3.find_all('script'):
        mo = re.search(pattern, script.get_text())
        if not mo:
            continue
        raw_entries = mo.group(1).split(',')
        return [client.get_url(entry.replace('\'', '')) for entry in raw_entries]
def metadata(self):
    """Parse the presentation page and return its metadata dictionary.

    Computed once and memoized in ``self._metadata``. Keys: url, title,
    date, auth, timecodes, slides, video_url, video_path, bio, summary,
    about, plus mp3/pdf when those downloads are advertised.
    """

    def get_title(pres_div):
        return pres_div.find('h1', class_="general").div.get_text().strip()

    def get_date(pres_div):
        # Fix: the original bound the parsed text to a local named `str`,
        # shadowing the builtin; renamed to `txt` (behavior unchanged).
        txt = pres_div.find('span', class_='author_general').contents[2]
        txt = txt.replace('\n', ' ')
        txt = txt.replace(six.u('\xa0'), ' ')
        # Text reads "... on <Mon DD, YYYY>"; keep what follows "on ".
        txt = txt.split("on ")[-1]
        txt = txt.strip()
        return datetime.datetime.strptime(txt, "%b %d, %Y")

    def get_author(pres_div):
        return pres_div.find(
            'span', class_='author_general').contents[1].get_text().strip()

    def get_timecodes(pres_div):
        # Timecodes live in an inline script: TIMES = new Array(1,2,...).
        # Returns None when no script matches.
        for script in pres_div.find_all('script'):
            mo = re.search("TIMES\s?=\s?new\s+Array.?\((\d+(,\d+)+)\)",
                           script.get_text())
            if mo:
                return [int(tc) for tc in mo.group(1).split(',')]

    def get_slides(pres_div):
        # Slide paths live in an inline script: var slides = new Array('...').
        for script in pres_div.find_all('script'):
            mo = re.search("var\s+slides\s?=\s?new\s+Array.?\(('.+')\)",
                           script.get_text())
            if mo:
                return [
                    slide.replace('\'', '')
                    for slide in mo.group(1).split(',')
                ]

    def get_video(pres_div):
        # The video path is base64-encoded in the jsclassref script variable.
        for script in pres_div.find_all('script'):
            mo = re.search('var jsclassref = \'(.*)\';', script.get_text())
            if mo:
                b64 = mo.group(1)
                path = base64.b64decode(b64).decode('utf-8')
                # Older presentations use flv and the video path does not
                # contain the extension. Newer presentations use mp4 and
                # include the extension.
                if path.endswith(".mp4"):
                    return "mp4:%s" % path
                elif path.endswith(".flv"):
                    return "flv:%s" % path[:-4]
                else:
                    raise Exception("Unsupported video type: %s" % path)

    def get_bio(div):
        return div.find('p', id="biotext").get_text(strip=True)

    def get_summary(div):
        # Drop the leading header chunk produced by the "|" separator.
        return "".join(
            div.find('p', id="summary").get_text("|", strip=True).split("|")[1:])

    def get_about(div):
        return div.find('p', id="conference").get_text(strip=True)

    def add_pdf_if_exist(metadata, pres_div):
        # The markup is not the same if authenticated or not
        form = pres_div.find('form', id="pdfForm")
        if form:
            metadata['pdf'] = client.get_url(
                '/pdfdownload.action?filename=') + urllib.parse.quote(
                    form.input['value'], safe='')
        else:
            a = pres_div.find('a', class_='link-slides')
            if a:
                metadata['pdf'] = client.get_url(a['href'])

    def add_mp3_if_exist(metadata, bc3):
        # The markup is not the same if authenticated or not
        form = bc3.find('form', id="mp3Form")
        if form:
            metadata['mp3'] = client.get_url(
                '/mp3download.action?filename=') + urllib.parse.quote(
                    form.input['value'], safe='')
        else:
            a = bc3.find('a', class_='link-mp3')
            if a:
                metadata['mp3'] = client.get_url(a['href'])

    if not hasattr(self, "_metadata"):
        pres_div = self.soup.find('div', class_='presentation_full')
        metadata = {
            'url': client.get_url("/presentations/" + self.id),
            'title': get_title(pres_div),
            'date': get_date(pres_div),
            'auth': get_author(pres_div),
            'timecodes': get_timecodes(self.soup),
            'slides': get_slides(self.soup),
            # Streaming endpoint is fixed; only the path varies per talk.
            'video_url': six.u("rtmpe://video.infoq.com/cfx/st/"),
            'video_path': get_video(self.soup),
            'bio': get_bio(pres_div),
            'summary': get_summary(pres_div),
            'about': get_about(pres_div),
        }
        add_mp3_if_exist(metadata, pres_div)
        add_pdf_if_exist(metadata, pres_div)
        self._metadata = metadata
    return self._metadata
def _fetch(self):
    """Download the page and create the soup"""
    page_url = client.get_url("/presentations/" + self.id)
    raw_bytes = self.client.fetch_no_cache(page_url)
    return bs4.BeautifulSoup(raw_bytes.decode('utf-8'), "html.parser")
def test_fetch_no_cache_error(self):
    """fetch_no_cache must raise DownloadError for a missing resource."""
    bad_url = client.get_url("/IDONOTEXIST")
    with self.assertRaises(client.DownloadError):
        self.iq.fetch_no_cache(bad_url)
def test_fetch_no_cache_error(self):
    """A non-existent path makes fetch_no_cache raise client.DownloadError."""
    with self.assertRaises(client.DownloadError):
        missing = client.get_url("/IDONOTEXIST")
        self.iq.fetch_no_cache(missing)
def get_url(div):
    """Absolute URL taken from the item's title link."""
    title_anchor = div.find('h2', class_='itemtitle').a
    return client.get_url(title_anchor['href'])
def get_url(div):
    """Absolute URL of the entry, read from its <h2 class="itemtitle"> link."""
    heading = div.find('h2', class_='itemtitle')
    return client.get_url(heading.a['href'])
def metadata(self):
    """Parse the presentation page and return its metadata dictionary.

    Computed once and memoized in ``self._metadata``. Keys: url, title,
    date, auth, timecodes, demo_timings, slides, video_url, video_path,
    bio, summary, about, plus mp3/pdf when those downloads are advertised.
    """

    def get_title(pres_div):
        return pres_div.find('h1', class_="general").div.get_text().strip()

    def get_date(pres_div):
        # The span's text contains "... on <Mon DD, YYYY>"; pull the date
        # out with a regex and fail loudly if the markup changed.
        strings = ''.join(pres_div.find('span', class_='author_general').strings)
        match = re.search('on[\n ]+(.*\d{4})', strings)
        if match:
            return datetime.datetime.strptime(match.group(1), "%b %d, %Y")
        else:
            raise Exception("Failed to extract date (markup changed?)")

    def get_author(pres_div):
        return pres_div.find('span', class_='authors-list').find('a').get_text().strip()

    def get_timecodes(pres_div):
        # Timecodes live in an inline script: TIMES = new Array(1,2,...).
        # Returns None when no script matches.
        for script in pres_div.find_all('script'):
            mo = re.search("TIMES\s?=\s?new\s+Array.?\((\d+(,\d+)+)\)", script.get_text())
            if mo:
                return [int(tc) for tc in mo.group(1).split(',')]

    def get_slides(pres_div):
        # Slide paths live in an inline script: var slides = new Array('...').
        for script in pres_div.find_all('script'):
            mo = re.search("var\s+slides\s?=\s?new\s+Array.?\(('.+')\)", script.get_text())
            if mo:
                return [slide.replace('\'', '') for slide in mo.group(1).split(',')]

    def get_video(pres_div):
        # The video path is base64-encoded in the jsclassref script variable.
        for script in pres_div.find_all('script'):
            mo = re.search('var jsclassref = \'(.*)\';', script.get_text())
            if mo:
                b64 = mo.group(1)
                path = base64.b64decode(b64).decode('utf-8')
                # Older presentations use flv and the video path does not contain
                # the extension. Newer presentations use mp4 and include the
                # extension.
                if path.endswith(".mp4"):
                    return "mp4:%s" % path
                elif path.endswith(".flv"):
                    return "flv:%s" % path[:-4]
                else:
                    raise Exception("Unsupported video type: %s" % path)

    def get_bio(div):
        return div.find('p', id="biotext").get_text(strip=True)

    def get_summary(div):
        # Drop the leading header chunk produced by the "|" separator.
        return "".join(div.find('p', id="summary").get_text("|", strip=True).split("|")[1:])

    def get_about(div):
        return div.find('p', id="conference").get_text(strip=True)

    def get_demo_timings(pres_div):
        # demoTimings = '1,2,...' in an inline script; empty list if absent.
        for script in pres_div.find_all('script'):
            timings = re.search("demoTimings\s+=\s+'([^']+)", script.get_text())
            if timings:
                return [int(t) for t in timings.group(1).split(',')]
        return []

    def add_pdf_if_exist(metadata, pres_div):
        # The markup is not the same if authenticated or not: a download form
        # when logged in, a plain link otherwise.
        form = pres_div.find('form', id="pdfForm")
        if form:
            metadata['pdf'] = client.get_url('/pdfdownload.action?filename=') + urllib.parse.quote(form.input['value'], safe='')
        else:
            a = pres_div.find('a', class_='link-slides')
            if a:
                metadata['pdf'] = client.get_url(a['href'])

    def add_mp3_if_exist(metadata, bc3):
        # The markup is not the same if authenticated or not (see add_pdf_if_exist).
        form = bc3.find('form', id="mp3Form")
        if form:
            metadata['mp3'] = client.get_url('/mp3download.action?filename=') + urllib.parse.quote(form.input['value'], safe='')
        else:
            a = bc3.find('a', class_='link-mp3')
            if a:
                metadata['mp3'] = client.get_url(a['href'])

    if not hasattr(self, "_metadata"):
        pres_div = self.soup.find('div', class_='presentation_full')
        metadata = {
            'url': client.get_url("/presentations/" + self.id),
            'title': get_title(pres_div),
            'date' : get_date(pres_div),
            'auth' : get_author(pres_div),
            'timecodes': get_timecodes(self.soup),
            'demo_timings': get_demo_timings(self.soup),
            'slides': get_slides(self.soup),
            # Streaming endpoint is fixed; only the path varies per talk.
            'video_url': six.u("rtmpe://video.infoq.com/cfx/st/"),
            'video_path': get_video(self.soup),
            'bio': get_bio(pres_div),
            'summary': get_summary(pres_div),
            'about': get_about(pres_div),
        }
        add_mp3_if_exist(metadata, pres_div)
        add_pdf_if_exist(metadata, pres_div)
        self._metadata = metadata
    return self._metadata
def _fetch(self):
    """Download the page and create the soup"""
    target = client.get_url("/presentations/" + self.id)
    html_text = self.client.fetch_no_cache(target).decode('utf-8')
    return bs4.BeautifulSoup(html_text, "html.parser")