def _copydoc():
    """
    Render the transcript HTML file through the ``copydoc.html`` template.

    The file at ``app_config.TRANSCRIPT_HTML_PATH`` is read, wrapped in a
    ``CopyDoc`` and handed to the template as ``doc``.
    """
    with open(app_config.TRANSCRIPT_HTML_PATH) as transcript:
        markup = transcript.read()

    rendered = render_template('copydoc.html', doc=CopyDoc(markup))
    return make_response(rendered)
def _copydoc(filename):
    """
    Render one configured episode document through ``copydoc.html``.

    The part of ``filename`` before the first ``.`` is used as the key into
    ``app_config.EPISODE_DOCUMENTS``; the document itself is read from that
    entry's ``path``.

    Calls ``abort(404)`` when ``data/<filename>`` does not exist or when the
    key has no entry in ``app_config.EPISODE_DOCUMENTS``.
    """
    key = filename.split('.')[0]
    if not os.path.exists('data/%s' % filename):
        abort(404)

    # A file can be present in data/ without being registered in the
    # configuration; treat that as "not found" instead of a KeyError.
    if key not in app_config.EPISODE_DOCUMENTS:
        abort(404)

    with open(app_config.EPISODE_DOCUMENTS[key]['path']) as f:
        html = f.read()

    doc = CopyDoc(html)
    context = {
        'doc': doc
    }

    return make_response(render_template('copydoc.html', **context))
class CopyDocSpaces(unittest.TestCase): """ Test bootstrapping postgres database """ def setUp(self): with open('tests/transcript_with_embed.html') as f: html_string = f.read() self.parser = CopyDoc(html_string, TOKENS) self.body = self.parser.soup.body def test_num_lines(self): self.assertEqual(len(self.body.contents), 4) def test_iframe_markup(self): self.assertTrue( '<iframe width="560" height="315" src="https://www.youtube.com/embed/dZTKOBElkyg" frameborder="0" allowfullscreen></iframe>' in self.parser.__unicode__())
def _episode(filename):
    """
    Render the ``episode.html`` page for one configured episode document.

    The part of ``filename`` before the first ``.`` is used as the key into
    ``app_config.EPISODE_DOCUMENTS``. The document at that entry's ``path``
    is parsed with ``CopyDoc`` and ``parse_doc.parse``; the parsed values are
    merged into the base context from ``make_context()`` together with the
    ``episode`` key and the entry's ``next`` value.

    Calls ``abort(404)`` when ``data/<filename>`` does not exist or when the
    key has no entry in ``app_config.EPISODE_DOCUMENTS``.
    """
    key = filename.split('.')[0]
    if not os.path.exists('data/%s' % filename):
        abort(404)

    # A file can be present in data/ without being registered in the
    # configuration; treat that as "not found" instead of a KeyError.
    if key not in app_config.EPISODE_DOCUMENTS:
        abort(404)

    episode_config = app_config.EPISODE_DOCUMENTS[key]

    with open(episode_config['path']) as f:
        html = f.read()

    context = make_context()
    doc = CopyDoc(html)
    parsed_document = parse_doc.parse(doc)
    context.update(parsed_document)
    context.update({
        'episode': key,
        'next': episode_config['next']
    })

    return make_response(render_template('episode.html', **context))
def parse_document(html):
    """
    Wrap ``html`` in a ``CopyDoc`` and return what ``parse_doc.parse``
    produces for it.
    """
    return parse_doc.parse(CopyDoc(html))
def lambda_handler(event, context):
    """
    Generate the factcheck preview files from a Google doc.

    Steps:
    - read the ``doc_key`` and ``authors_key`` drive keys from ``event``
    - fetch the authors spreadsheet and the document through
      ``app_config.authomatic`` with the credentials built from
      ``app_config.GOOGLE_CREDS`` (refreshed first when not valid)
    - parse the document with ``CopyDoc`` and ``parse_doc.parse``
    - pass the parsed context to ``upload_template_contents`` for
      ``factcheck.html``, ``share.html`` and, with ``preview`` set, for
      ``factcheck_preview.html``

    Returns a dict with a success ``message``.

    Raises ``app_config.UserException`` when a key is missing from ``event``,
    when either fetch does not return HTTP 200, or when the authors
    spreadsheet yields no authors. The exception is logged and re-raised.
    """
    try:
        try:
            logger.info('Start preview generation')
            doc_key = event['doc_key']
            authors_key = event['authors_key']
        except KeyError:
            msg = 'Did not find needed params in %s' % (event)
            raise app_config.UserException('[BadRequest]: %s' % msg)

        authors_url = app_config.SPREADSHEET_URL_TEMPLATE % (authors_key)
        doc_url = app_config.DOC_URL_TEMPLATE % (doc_key)

        # Get the credentials and refresh them if necessary
        credentials = app_config.authomatic.credentials(
            app_config.GOOGLE_CREDS)
        if not credentials.valid:
            credentials.refresh()

        # Get authors
        response = app_config.authomatic.access(credentials, authors_url)
        if response.status != 200:
            msg = 'While accessing %s got HTTP: %s' % (authors_url,
                                                       response.status)
            raise app_config.UserException('[BadRequest]: %s' % msg)
        authors_data = response.content
        authors = transform_authors(authors_data)
        if not authors:
            msg = 'Could not parse authors spreadsheet %s' % (authors_url)
            raise app_config.UserException('[BadRequest]: %s' % msg)

        # Get document
        response = app_config.authomatic.access(credentials, doc_url)
        if response.status != 200:
            msg = 'While accessing %s got HTTP: %s' % (doc_url,
                                                       response.status)
            raise app_config.UserException('[BadRequest]: %s' % msg)
        html = response.content

        # Parse data
        doc = CopyDoc(html)
        logger.info('Parsed doc html with copydoc')
        # Separate name so the handler's ``context`` argument is not shadowed
        template_context = parse_doc.parse(doc, authors)
        logger.info('Parsed factcheck')

        # Generate final files and upload to S3
        upload_template_contents(template_context, 'factcheck.html')
        upload_template_contents(template_context, 'share.html')
        template_context['preview'] = True
        upload_template_contents(template_context,
                                 'factcheck.html',
                                 'factcheck_preview.html')
        logger.info('Generated factcheck templates. Execution successful')
        return {'message': 'Preview generated successfully'}
    # ``as`` form is accepted by Python 2.6+ and Python 3
    except app_config.UserException as e:
        logger.error('Exit with controlled exception %s' % e)
        raise
def setUp(self):
    """
    Parse ``tests/testdoc.html`` with CopyDoc and expose the parser and the
    top-level nodes of the parsed body on the test case.
    """
    with open('tests/testdoc.html') as fixture:
        markup = fixture.read()

    parsed = CopyDoc(markup, TOKENS)
    self.parser = parsed
    self.contents = parsed.soup.body.contents
class CopyDocTestCase(unittest.TestCase): """ Test bootstrapping postgres database """ def setUp(self): with open('tests/testdoc.html') as f: html_string = f.read() self.parser = CopyDoc(html_string, TOKENS) self.contents = self.parser.soup.body.contents def test_num_lines(self): self.assertEqual(len(self.contents), 19) def test_h1(self): self._is_tag(self.contents[0], 'h1') def test_h1_has_no_children(self): child_length = len(self.contents[0].find_all()) self.assertEqual(child_length, 0) def test_h2(self): self._is_tag(self.contents[1], 'h2') def test_h3(self): self._is_tag(self.contents[2], 'h3') def test_p(self): self._is_tag(self.contents[3], 'p') def test_strong(self): self._contains_tag(self.contents[4], 'strong') def test_em(self): self._contains_tag(self.contents[5], 'em') def test_u(self): self._contains_tag(self.contents[6], 'u') def test_ignore_html(self): self._contains_tag(self.contents[7], 'strong', 0) def test_a(self): self._contains_tag(self.contents[8], 'a') def test_a_count(self): tags = self.parser.soup.body.findAll('a') self.assertEqual(len(tags), 2) def test_ahref(self): href = self.contents[8].a.attrs['href'][0] self.assertEqual(href, 'http://npr.org') def test_ul(self): self._is_tag(self.contents[9], 'ul') def test_ul_li(self): count_li = len(self.contents[9].find_all('li')) self.assertEqual(count_li, 3) def test_ol(self): self._is_tag(self.contents[10], 'ol') def test_ol_li(self): count_li = len(self.contents[10].find_all('li')) self.assertEqual(count_li, 3) def test_img(self): self._contains_tag(self.contents[11], 'img') def test_strange_has_no_children(self): child_length = len(self.contents[12].find_all()) self.assertEqual(child_length, 0) def test_strange_has_extra_space_bug(self): clean_string = self.parser.clean_linebreaks(self.contents[12]) expected_string = '<p>Strange formatting</p>' self.assertEqual(clean_string, expected_string) def test_tabletag(self): self._is_tag(self.contents[13], 'table') def test_tabletd(self): 
self._contains_tag(self.contents[13], 'td', 4) def test_tabletr(self): self._contains_tag(self.contents[13], 'tr', 2) def test_anchortag_combination(self): self._contains_tag(self.contents[15], 'a') def test_headline_extraction(self): self.assertEqual(self.parser.headline, 'this is a headline') def test_subhed_extraction(self): self.assertEqual(self.parser.subhed, 'this is a subhed') def test_banner_extraction(self): self.assertEqual(self.parser.banner, 'this is a banner') def test_image_extraction(self): self.assertEqual(self.parser.image, 'http://media.npr.org/assets/img/2015/12/29/gettyimages-477258926_wide-s700-c85.jpg') def test_mobile_image_extraction(self): self.assertEqual(self.parser.mobile_image, 'https://media.giphy.com/media/3oEdv5FXteGY8iS8CY/giphy.gif') def test_audio_url_extraction(self): self.assertEqual(self.parser.audio_url, 'http://play.podtrac.com/npr-510310/npr.mc.tritondigital.com/NPR_510310/media/anon.npr-mp3/npr/nprpolitics/2016/02/20160205_nprpolitics_roundup.mp3?orgId=1&d=2261&p=510310&story=465741966&t=podcast&e=465741966&ft=pod&f=510310') def test_credit_extraction(self): self.assertEqual(self.parser.credit, 'this is a photo credit') def test_mobile_credit_extraction(self): self.assertEqual(self.parser.mobile_credit, 'this is a mobile photo credit') def test_iframe_markup(self): self.assertTrue('<iframe width="560" height="315" src="https://www.youtube.com/embed/659pppwniXA" frameborder="0" allowfullscreen></iframe>' in self.parser.__unicode__()) def test_nbsp_markup(self): self.assertTrue('This is a paragraph with a non-breaking space.' 
in self.parser.__unicode__()) def spaces_stripped(self): clean_string = self.parser.clean_linebreaks(self.contents[17]) expected_string = '<p>This is a paragraph with multiple spaces.</p>' self.assertEqual(child_length, 0) def _is_tag(self, tag, tag_name): self.assertEqual(tag.name, tag_name) def _contains_tag(self, tag, tag_name, count=1): child_length = len(tag.findAll(tag_name)) self.assertEqual(child_length, count)
def setUp(self):
    """
    Parse ``tests/transcript_with_embed.html`` with CopyDoc and expose the
    parser and the parsed body on the test case.
    """
    with open('tests/transcript_with_embed.html') as fixture:
        markup = fixture.read()

    parsed = CopyDoc(markup, TOKENS)
    self.parser = parsed
    self.body = parsed.soup.body
class CopyDocSpaces(unittest.TestCase): """ Test bootstrapping postgres database """ def setUp(self): with open('tests/transcript_with_embed.html') as f: html_string = f.read() self.parser = CopyDoc(html_string, TOKENS) self.body = self.parser.soup.body def test_num_lines(self): self.assertEqual(len(self.body.contents), 4) def test_iframe_markup(self): self.assertTrue('<iframe width="560" height="315" src="https://www.youtube.com/embed/dZTKOBElkyg" frameborder="0" allowfullscreen></iframe>' in self.parser.__unicode__())
class CopyDocTestCase(unittest.TestCase): """ Test bootstrapping postgres database """ def setUp(self): with open('tests/testdoc.html') as f: html_string = f.read() self.parser = CopyDoc(html_string, TOKENS) self.contents = self.parser.soup.body.contents def test_num_lines(self): self.assertEqual(len(self.contents), 19) def test_h1(self): self._is_tag(self.contents[0], 'h1') def test_h1_has_no_children(self): child_length = len(self.contents[0].find_all()) self.assertEqual(child_length, 0) def test_h2(self): self._is_tag(self.contents[1], 'h2') def test_h3(self): self._is_tag(self.contents[2], 'h3') def test_p(self): self._is_tag(self.contents[3], 'p') def test_strong(self): self._contains_tag(self.contents[4], 'strong') def test_em(self): self._contains_tag(self.contents[5], 'em') def test_u(self): self._contains_tag(self.contents[6], 'u') def test_ignore_html(self): self._contains_tag(self.contents[7], 'strong', 0) def test_a(self): self._contains_tag(self.contents[8], 'a') def test_a_count(self): tags = self.parser.soup.body.findAll('a') self.assertEqual(len(tags), 2) def test_ahref(self): href = self.contents[8].a.attrs['href'][0] self.assertEqual(href, 'http://npr.org') def test_ul(self): self._is_tag(self.contents[9], 'ul') def test_ul_li(self): count_li = len(self.contents[9].find_all('li')) self.assertEqual(count_li, 3) def test_ol(self): self._is_tag(self.contents[10], 'ol') def test_ol_li(self): count_li = len(self.contents[10].find_all('li')) self.assertEqual(count_li, 3) def test_img(self): self._contains_tag(self.contents[11], 'img') def test_strange_has_no_children(self): child_length = len(self.contents[12].find_all()) self.assertEqual(child_length, 0) def test_strange_has_extra_space_bug(self): clean_string = self.parser.clean_linebreaks(self.contents[12]) expected_string = '<p>Strange formatting</p>' self.assertEqual(clean_string, expected_string) def test_tabletag(self): self._is_tag(self.contents[13], 'table') def test_tabletd(self): 
self._contains_tag(self.contents[13], 'td', 4) def test_tabletr(self): self._contains_tag(self.contents[13], 'tr', 2) def test_anchortag_combination(self): self._contains_tag(self.contents[15], 'a') def test_headline_extraction(self): self.assertEqual(self.parser.headline, 'this is a headline') def test_subhed_extraction(self): self.assertEqual(self.parser.subhed, 'this is a subhed') def test_banner_extraction(self): self.assertEqual(self.parser.banner, 'this is a banner') def test_image_extraction(self): self.assertEqual( self.parser.image, 'http://media.npr.org/assets/img/2015/12/29/gettyimages-477258926_wide-s700-c85.jpg' ) def test_mobile_image_extraction(self): self.assertEqual( self.parser.mobile_image, 'https://media.giphy.com/media/3oEdv5FXteGY8iS8CY/giphy.gif') def test_audio_url_extraction(self): self.assertEqual( self.parser.audio_url, 'http://play.podtrac.com/npr-510310/npr.mc.tritondigital.com/NPR_510310/media/anon.npr-mp3/npr/nprpolitics/2016/02/20160205_nprpolitics_roundup.mp3?orgId=1&d=2261&p=510310&story=465741966&t=podcast&e=465741966&ft=pod&f=510310' ) def test_credit_extraction(self): self.assertEqual(self.parser.credit, 'this is a photo credit') def test_mobile_credit_extraction(self): self.assertEqual(self.parser.mobile_credit, 'this is a mobile photo credit') def test_iframe_markup(self): self.assertTrue( '<iframe width="560" height="315" src="https://www.youtube.com/embed/659pppwniXA" frameborder="0" allowfullscreen></iframe>' in self.parser.__unicode__()) def test_nbsp_markup(self): self.assertTrue('This is a paragraph with a non-breaking space.' 
in self.parser.__unicode__()) def spaces_stripped(self): clean_string = self.parser.clean_linebreaks(self.contents[17]) expected_string = '<p>This is a paragraph with multiple spaces.</p>' self.assertEqual(child_length, 0) def _is_tag(self, tag, tag_name): self.assertEqual(tag.name, tag_name) def _contains_tag(self, tag, tag_name, count=1): child_length = len(tag.findAll(tag_name)) self.assertEqual(child_length, count)
def setUp(self):
    """
    Parse ``tests/link_italic.html`` with CopyDoc and expose the parser and
    the parsed body on the test case.
    """
    with open('tests/link_italic.html') as fixture:
        markup = fixture.read()

    parsed = CopyDoc(markup, TOKENS)
    self.parser = parsed
    self.body = parsed.soup.body