def test_html_to_markdown(self):
    """
    Tests that html_to_markdown accepts a nested-tag HTML string without
    raising HTTPError.

    Bug fix: the original assignment ended with a stray ', "html.parser"'
    (copy-paste residue from a BeautifulSoup(...) call), which silently made
    test_html a 2-tuple instead of a string.
    :return:
    """
    parser = CurrentsArticleParser()
    test_html = ('<b class="boldest">' +
                 'THIS IS TEST HTML and' +
                 'SOME MORE TEST HTML' +
                 '<p>Back to the <a rel="index">' +
                 'INSIDE A LINK' +
                 '</a></p></b>')
    try:
        parser.html_to_markdown(test_html)
    except HTTPError:
        # Conversion evidently can raise HTTPError; treat that as a failure.
        print("Could not convert HTML to markdown")
        self.fail()
def test_get_soup_from_url(self):
    """
    tests that a bs4.BeautifulSoup.Soup object is created from a url
    :return:
    """
    article_parser = CurrentsArticleParser()
    target_url = 'http://google.com'
    try:
        article_parser.get_soup_from_url(target_url)
    except HTTPError:
        # A failed fetch means no soup could be built — fail the test.
        print("Soup was not created due to HTTPError")
        self.fail()
    else:
        print('Soup was created')
def test_zap_tag_contents(self):
    """
    Tests the conversion of Unicode and windows cp1252 characters to
    ascii equivalents within a bs4.BeautifulSoup.Tag object
    :return:
    """
    article_parser = CurrentsArticleParser()
    # Windows cp1252 code points that should be mapped to ascii equivalents.
    cp1252_chars = (u'\x80\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C' +
                    u'\x8E\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9E\x9F')
    markup = ('<b class="boldest">' + cp1252_chars +
              '<p>Back to the <a rel="index">' +
              Utils.get_random_unicode(10) +
              '</a></p></b>')
    bold_tag = BeautifulSoup(markup, "html.parser").b
    article_parser.zap_tag_contents(bold_tag)
    self.zap_tag_contents_test_helper(bold_tag)
def test_get_url_slug(self):
    """
    Tests that get_url_slug extracts the final path segment of a URL,
    with any file extension stripped.
    :return:
    """
    parser = CurrentsArticleParser()
    # (input url, expected slug, failure message) — checked in order,
    # failing fast on the first mismatch, as before.
    cases = [
        ('http://www.example.com:8080/sources/test', 'test',
         "slug A did not match"),
        ('http://hello.net/slug.pdf', 'slug',
         "slug B did not match"),
    ]
    for url, expected_slug, message in cases:
        if parser.get_url_slug(url) != expected_slug:
            print(message)
            self.fail()
def test_zap_tag_contents(self):
    """
    Tests the conversion of Unicode and windows cp1252 characters to
    ascii equivalents within a bs4.BeautifulSoup.Tag object
    :return:
    """
    converter = CurrentsArticleParser()
    # Build markup containing raw cp1252 bytes plus random unicode text.
    test_markup = (
        '<b class="boldest">' +
        u'\x80\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C' +
        u'\x8E\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9E\x9F' +
        '<p>Back to the <a rel="index">' +
        Utils.get_random_unicode(10) +
        '</a></p></b>'
    )
    document = BeautifulSoup(test_markup, "html.parser")
    target = document.b
    converter.zap_tag_contents(target)
    self.zap_tag_contents_test_helper(target)
"""Driver script: reads article URLs from filenames.txt (one per line)
and feeds them to CurrentsArticleParser.run_parser."""
from currentsArticleParser import CurrentsArticleParser

import requests

parser = CurrentsArticleParser()

article_url_list = []
try:
    # Bug fix: the original called fi.close() inside the try, so an IOError
    # raised mid-iteration leaked the file handle. The with-statement
    # guarantees the file is closed on every exit path.
    with open('filenames.txt', "r") as fi:
        for article_url in fi:
            article_url_list.append(article_url.rstrip())
except IOError:
    print("Error: File does not appear to exist.")

# Runs with an empty list if the file could not be read (original behavior).
parser.run_parser(article_url_list)