def test_html_to_markdown(self):
    """
    Tests that html_to_markdown() accepts a nested-HTML snippet
    without raising; fails the test on HTTPError.
    :return:
    """
    parser = CurrentsArticleParser()
    # BUG FIX: the original ended the assignment with ', "html.parser"',
    # which made test_html a (str, str) TUPLE instead of an HTML string
    # (the second element was copy/pasted from a BeautifulSoup() call).
    test_html = ('<b class="boldest">'
                 'THIS IS TEST HTML and'
                 'SOME MORE TEST HTML'
                 '<p>Back to the <a rel="index">INSIDE A LINK</a></p></b>')
    try:
        parser.html_to_markdown(test_html)
    except HTTPError:
        # Parenthesized single-arg print is valid in both Python 2 and 3.
        print("Could not convert HTML to markdown")
        self.fail()
 def test_html_to_markdown(self):
     parser = CurrentsArticleParser()
     test_html = '<b class="boldest">' + \
                 'THIS IS TEST HTML and' + \
                 'SOME MORE TEST HTML' + \
                 '<p>Back to the <a rel="index">' + 'INSIDE A LINK' + \
                 '</a></p></b>', "html.parser"
     try:
         parser.html_to_markdown(test_html)
     except HTTPError:
         print "Could not convert HTML to markdown"
         self.fail()
 def test_get_soup_from_url(self):
     """
     tests that a bs4.BeautifulSoup.Soup object is created from a url
     :return:
     """
     parser = CurrentsArticleParser()
     try:
         parser.get_soup_from_url('http://google.com')
     except HTTPError:
         print "Soup was not created due to HTTPError"
         self.fail()
     else:
         print 'Soup was created'
 def test_get_soup_from_url(self):
     """
     tests that a bs4.BeautifulSoup.Soup object is created from a url
     :return:
     """
     parser = CurrentsArticleParser()
     try:
         parser.get_soup_from_url('http://google.com')
     except HTTPError:
         print "Soup was not created due to HTTPError"
         self.fail()
     else:
         print 'Soup was created'
 def test_zap_tag_contents(self):
     """
     Tests the conversion of Unicode and windows cp1252 characters to
     ascii equivalents within a bs4.BeautifulSoup.Tag object
     :return:
     """
     parser = CurrentsArticleParser()
     soup = BeautifulSoup('<b class="boldest">' +
                          u'\x80\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C' +
                          u'\x8E\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9E\x9F' +
                          '<p>Back to the <a rel="index">' + Utils.get_random_unicode(10) +
                          '</a></p></b>', "html.parser")
     tag = soup.b
     parser.zap_tag_contents(tag)
     self.zap_tag_contents_test_helper(tag)
    def test_get_url_slug(self):
        """Check get_url_slug() against two representative URLs."""
        parser = CurrentsArticleParser()

        # (url, expected slug, failure message) — checked in order;
        # the first mismatch reports and fails the test, as before.
        cases = [
            ('http://www.example.com:8080/sources/test', 'test',
             "slug A did not match"),
            ('http://hello.net/slug.pdf', 'slug',
             "slug B did not match"),
        ]
        for url, expected, message in cases:
            if parser.get_url_slug(url) != expected:
                print(message)
                self.fail()
    def test_get_url_slug(self):
        """Check get_url_slug() against two representative URLs."""
        parser = CurrentsArticleParser()

        # (url, expected slug, failure message) — checked in order;
        # the first mismatch reports and fails the test, as before.
        cases = [
            ('http://www.example.com:8080/sources/test', 'test',
             "slug A did not match"),
            ('http://hello.net/slug.pdf', 'slug',
             "slug B did not match"),
        ]
        for url, expected, message in cases:
            if parser.get_url_slug(url) != expected:
                print(message)
                self.fail()
 def test_zap_tag_contents(self):
     """
     Tests the conversion of Unicode and windows cp1252 characters to
     ascii equivalents within a bs4.BeautifulSoup.Tag object
     :return:
     """
     parser = CurrentsArticleParser()
     soup = BeautifulSoup(
         '<b class="boldest">' +
         u'\x80\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C' +
         u'\x8E\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9E\x9F' +
         '<p>Back to the <a rel="index">' + Utils.get_random_unicode(10) +
         '</a></p></b>', "html.parser")
     tag = soup.b
     parser.zap_tag_contents(tag)
     self.zap_tag_contents_test_helper(tag)
# Example #9 (scraped-page artifact: original text was "示例#9" with a vote count of 0)
from currentsArticleParser import CurrentsArticleParser
import requests

# Driver script: read one article URL per line from filenames.txt and
# feed the list to CurrentsArticleParser.run_parser().
parser = CurrentsArticleParser()

# parser.scrape_url('http://currents.ucsc.edu/06-07/03-19/faults.asp')

article_url_list = []

try:
    # FIX: 'with' guarantees the file is closed even if iteration raises;
    # the original called fi.close() only on the success path, leaking the
    # descriptor on any mid-read error.
    with open('filenames.txt', "r") as fi:
        article_url_list = [line.rstrip() for line in fi]
except IOError:
    # Parenthesized single-arg print is valid in both Python 2 and 3.
    print("Error: File does not appear to exist.")


# article_url_list = ['http://www1.ucsc.edu/currents/03-04/12-08/CURRENTS ONLINE/03-04/11-10/firefighters.html',]

parser.run_parser(article_url_list)
# parser.temp_driver('http://www1.ucsc.edu/currents/00-01/01-01/coastal.html')
# parser.temp_driver('http://www1.ucsc.edu/currents/03-04/12-08/CURRENTS%20ONLINE/03-04/11-17/activism.html')