Exemplo n.º 1
0
class TestDiffbotTransformer(unittest.TestCase):
    def setUp(self):
        # Shared fixtures for every test: the page URL handed to extract(),
        # two canned JSON payloads, and the transformer under test.
        self.url = 'http://httpbin.org/'
        # JSON text for the plain (no html flag) extraction. It is a raw
        # string, so the backslash escapes (\/, \", \n) stay literal
        # two-character sequences instead of being interpreted by Python.
        self.json_result = r"""{"tags":["Military service","Hypertext Transfer Protocol","Windows service"],"summary":"Testing an HTTP Library can become difficult sometimes.  PostBin.org is fantastic for testing POST requests, but not much else.  This exists to cover all kinds of HTTP scenarios.  Additional endpoints are being considered (e.g.  \/deflate). {\"user-agent\": \"curl\/7.19.7 (universal-apple-darwin10.0) libcurl\/7.19.7 OpenSSL\/0.9.8l zlib\/1.2.3\"}","text":"Freely hosted in HTTP, HTTPS & EU flavors.\n\n\nTesting an HTTP Library can become difficult sometimes. PostBin.org is fantastic for testing POST requests, but not much else. This exists to cover all kinds of HTTP scenarios. Additional endpoints are being considered (e.g. \/deflate).\nAll endpoint responses are JSON-encoded.\nEXAMPLES\n$ curl http:\/\/httpbin.org\/ip\n{\"origin\": \"24.127.96.129\"}\n$ curl http:\/\/httpbin.org\/user-agent\n{\"user-agent\": \"curl\/7.19.7 (universal-apple-darwin10.0) libcurl\/7.19.7 OpenSSL\/0.9.8l zlib\/1.2.3\"}\n$ curl http:\/\/httpbin.org\/get\n{ \"args\": {}, \"headers\": { \"Accept\": \"*\/*\", \"Connection\": \"close\", \"Content-Length\": \"\", \"Content-Type\": \"\", \"Host\": \"httpbin.org\", \"User-Agent\": \"curl\/7.19.7 (universal-apple-darwin10.0) libcurl\/7.19.7 OpenSSL\/0.9.8l zlib\/1.2.3\" }, \"origin\": \"24.127.96.129\", \"url\": \"http:\/\/httpbin.org\/get\" }\n$ curl -I http:\/\/httpbin.org\/status\/418\nHTTP\/1.1 418 I'M A TEAPOT Server: nginx\/0.7.67 Date: Mon, 13 Jun 2011 04:25:38 GMT Connection: close x-more-info: http:\/\/tools.ietf.org\/html\/rfc2324 Content-Length: 135\nAUTHOR\nA Kenneth Reitz Project.\nSEE ALSO\nhttp:\/\/python-requests.org","title":"httpbin(1): HTTP Client Testing Service","type":"article","url":"http:\/\/httpbin.org\/","xpath":"\/HTML[1]\/BODY[1]\/DIV[1]"}"""
        # Same payload with an additional "html" key, for the html=True tests.
        # NOTE(review): this literal spans three physical lines, so the string
        # contains two real newline characters -- confirm that matches the
        # service response and is not a copy/paste artifact.
        self.json_html_result = r"""{"tags":["Military service","Hypertext Transfer Protocol","Windows service"],"summary":"Testing an HTTP Library can become difficult sometimes.  PostBin.org is fantastic for testing POST requests, but not much else.  This exists to cover all kinds of HTTP scenarios.  Additional endpoints are being considered (e.g.  \/deflate). {\"user-agent\": \"curl\/7.19.7 (universal-apple-darwin10.0) libcurl\/7.19.7 OpenSSL\/0.9.8l zlib\/1.2.3\"}","text":"Freely hosted in HTTP, HTTPS & EU flavors.\n\n\nTesting an HTTP Library can become difficult sometimes. PostBin.org is fantastic for testing POST requests, but not much else. This exists to cover all kinds of HTTP scenarios. Additional endpoints are being considered (e.g. \/deflate).\nAll endpoint responses are JSON-encoded.\nEXAMPLES\n$ curl http:\/\/httpbin.org\/ip\n{\"origin\": \"24.127.96.129\"}\n$ curl http:\/\/httpbin.org\/user-agent\n{\"user-agent\": \"curl\/7.19.7 (universal-apple-darwin10.0) libcurl\/7.19.7 OpenSSL\/0.9.8l zlib\/1.2.3\"}\n$ curl http:\/\/httpbin.org\/get\n{ \"args\": {}, \"headers\": { \"Accept\": \"*\/*\", \"Connection\": \"close\", \"Content-Length\": \"\", \"Content-Type\": \"\", \"Host\": \"httpbin.org\", \"User-Agent\": \"curl\/7.19.7 (universal-apple-darwin10.0) libcurl\/7.19.7 OpenSSL\/0.9.8l zlib\/1.2.3\" }, \"origin\": \"24.127.96.129\", \"url\": \"http:\/\/httpbin.org\/get\" }\n$ curl -I http:\/\/httpbin.org\/status\/418\nHTTP\/1.1 418 I'M A TEAPOT Server: nginx\/0.7.67 Date: Mon, 13 Jun 2011 04:25:38 GMT Connection: close x-more-info: http:\/\/tools.ietf.org\/html\/rfc2324 Content-Length: 135\nAUTHOR\nA Kenneth Reitz Project.\nSEE ALSO\nhttp:\/\/python-requests.org","title":"httpbin(1): HTTP Client Testing Service","html":"<div><p>Freely hosted in <a href=\"http:\/\/httpbin.org\">HTTP<\/a>,\n<a href=\"https:\/\/httpbin.org\">HTTPS<\/a> &amp;\n<a href=\"http:\/\/eu.httpbin.org\">EU<\/a>\nflavors.<\/p><h2 id=\"ENDPOINTS\">ENDPOINTS<\/h2><h2 
id=\"DESCRIPTION\">DESCRIPTION<\/h2><p>Testing an HTTP Library can become difficult sometimes. PostBin.org is fantastic\nfor testing POST requests, but not much else. This exists to cover all kinds of HTTP\nscenarios. Additional endpoints are being considered (e.g. <code>\/deflate<\/code>).<\/p><p>All endpoint responses are JSON-encoded.<\/p><h2 id=\"EXAMPLES\">EXAMPLES<\/h2><h3 id=\"-curl-http-httpbin-org-ip\">$ curl http:\/\/httpbin.org\/ip<\/h3><pre>&lt;code&gt;{&quot;origin&quot;: &quot;24.127.96.129&quot;}\n&lt;\/code&gt;<\/pre><h3 id=\"-curl-http-httpbin-org-user-agent\">$ curl http:\/\/httpbin.org\/user-agent<\/h3><pre>&lt;code&gt;{&quot;user-agent&quot;: &quot;curl\/7.19.7 (universal-apple-darwin10.0) libcurl\/7.19.7 OpenSSL\/0.9.8l zlib\/1.2.3&quot;}\n&lt;\/code&gt;<\/pre><h3 id=\"-curl-http-httpbin-org-get\">$ curl http:\/\/httpbin.org\/get<\/h3><pre>&lt;code&gt;{\n   &quot;args&quot;: {},\n   &quot;headers&quot;: {\n      &quot;Accept&quot;: &quot;*\/*&quot;,\n      &quot;Connection&quot;: &quot;close&quot;,\n      &quot;Content-Length&quot;: &quot;&quot;,\n      &quot;Content-Type&quot;: &quot;&quot;,\n      &quot;Host&quot;: &quot;httpbin.org&quot;,\n      &quot;User-Agent&quot;: &quot;curl\/7.19.7 (universal-apple-darwin10.0) libcurl\/7.19.7 OpenSSL\/0.9.8l zlib\/1.2.3&quot;\n   },\n   &quot;origin&quot;: &quot;24.127.96.129&quot;,\n   &quot;url&quot;: &quot;http:\/\/httpbin.org\/get&quot;\n}\n&lt;\/code&gt;<\/pre><h3 id=\"-curl-I-http-httpbin-org-status-418\">$ curl -I http:\/\/httpbin.org\/status\/418<\/h3><pre>&lt;code&gt;HTTP\/1.1 418 I'M A TEAPOT\nServer: nginx\/0.7.67\nDate: Mon, 13 Jun 2011 04:25:38 GMT\nConnection: close\nx-more-info: http:\/\/tools.ietf.org\/html\/rfc2324\nContent-Length: 135\n&lt;\/code&gt;<\/pre><h2 id=\"AUTHOR\">AUTHOR<\/h2><p>A <a href=\"http:\/\/kennethreitz.com\/pages\/open-projects.html\">Kenneth Reitz<\/a>\nProject.<\/p><h2 id=\"SEE-ALSO\">SEE ALSO<\/h2><p><a data-bare-link=\"true\" 
href=\"http:\/\/python-requests.org\">http:\/\/python-requests.org<\/a><\/p><\/div>","type":"article","url":"http:\/\/httpbin.org\/","xpath":"\/HTML[1]\/BODY[1]\/DIV[1]"}"""
        # Object under test, built with the module-level API token.
        self.diffbot = DiffbotTransformer(DIFFBOT_TOKEN)

    def test_connection(self):
        # With requests.get patched out, a default extract() must ask for the
        # article endpoint with only the tags and summary flags appended.
        expected_call = 'https://www.diffbot.com/api/article?token={}&url=http://httpbin.org/&tags&summary'.format(DIFFBOT_TOKEN)

        with mock.patch('requests.get') as fake_get:
            fake_get.return_value.content = self.json_result
            self.diffbot.extract(self.url)

            fake_get.assert_called_with(expected_call)

    def test_extract_body(self):
        #FIXME makes actual connection
        # Nothing is patched here: the value returned by extract() is compared
        # verbatim with the canned JSON text from setUp.
        json_result = self.diffbot.extract(self.url)
        # assertEqual replaces the deprecated assertEquals alias, which no
        # longer exists in Python 3.12+.
        self.assertEqual(json_result, self.json_result)

    def test_html_connection(self):
        # With requests.get patched out, extract(html=True) must add the html
        # flag ahead of the tags and summary flags in the requested URL.
        expected_call = 'https://www.diffbot.com/api/article?token={}&url=http://httpbin.org/&html&tags&summary'.format(DIFFBOT_TOKEN)

        with mock.patch('requests.get') as fake_get:
            fake_get.return_value.content = self.json_html_result
            self.diffbot.extract(self.url, html=True)

            fake_get.assert_called_with(expected_call)

    def test_extract_body_html(self):
        #FIXME makes actual connection
        # Nothing is patched here: the value returned by extract(html=True) is
        # compared verbatim with the canned HTML-flavoured JSON text from setUp.
        json_result = self.diffbot.extract(self.url, html=True)
        # assertEqual replaces the deprecated assertEquals alias, which no
        # longer exists in Python 3.12+.
        self.assertEqual(json_result, self.json_html_result)

    def test_malformed_json(self):
        #FIXME doesn't do much now
        #FIXME should reflect the current code (need to refactor)
        # Runs extract() (unpatched) on a different page and checks only that
        # an undecodable response is logged instead of raising; there is no
        # assertion on the decoded value yet.
        url = 'http://www.mnot.net/cache_docs/'
        json_result = self.diffbot.extract(url, html=True)
        try:
            json.loads(json_result)
        except ValueError:
            # ValueError is the base class of JSONDecodeError in both the
            # standard json module and simplejson, so this handler works with
            # either. The message uses the local ``url``; the previous code
            # referenced an undefined ``bookmark`` and raised NameError here.
            logging.error("Unable to decode JSON for resource at : {}".format(url))
Exemplo n.º 2
0
 def setUp(self):
     # Test fixtures: the page URL, two canned JSON payloads, and the
     # transformer under test.
     self.url = 'http://httpbin.org/'
     # JSON text for the plain (no "html" key) payload. It is a raw string,
     # so the backslash escapes (\/, \", \n) stay literal two-character
     # sequences instead of being interpreted by Python.
     self.json_result = r"""{"tags":["Military service","Hypertext Transfer Protocol","Windows service"],"summary":"Testing an HTTP Library can become difficult sometimes.  PostBin.org is fantastic for testing POST requests, but not much else.  This exists to cover all kinds of HTTP scenarios.  Additional endpoints are being considered (e.g.  \/deflate). {\"user-agent\": \"curl\/7.19.7 (universal-apple-darwin10.0) libcurl\/7.19.7 OpenSSL\/0.9.8l zlib\/1.2.3\"}","text":"Freely hosted in HTTP, HTTPS & EU flavors.\n\n\nTesting an HTTP Library can become difficult sometimes. PostBin.org is fantastic for testing POST requests, but not much else. This exists to cover all kinds of HTTP scenarios. Additional endpoints are being considered (e.g. \/deflate).\nAll endpoint responses are JSON-encoded.\nEXAMPLES\n$ curl http:\/\/httpbin.org\/ip\n{\"origin\": \"24.127.96.129\"}\n$ curl http:\/\/httpbin.org\/user-agent\n{\"user-agent\": \"curl\/7.19.7 (universal-apple-darwin10.0) libcurl\/7.19.7 OpenSSL\/0.9.8l zlib\/1.2.3\"}\n$ curl http:\/\/httpbin.org\/get\n{ \"args\": {}, \"headers\": { \"Accept\": \"*\/*\", \"Connection\": \"close\", \"Content-Length\": \"\", \"Content-Type\": \"\", \"Host\": \"httpbin.org\", \"User-Agent\": \"curl\/7.19.7 (universal-apple-darwin10.0) libcurl\/7.19.7 OpenSSL\/0.9.8l zlib\/1.2.3\" }, \"origin\": \"24.127.96.129\", \"url\": \"http:\/\/httpbin.org\/get\" }\n$ curl -I http:\/\/httpbin.org\/status\/418\nHTTP\/1.1 418 I'M A TEAPOT Server: nginx\/0.7.67 Date: Mon, 13 Jun 2011 04:25:38 GMT Connection: close x-more-info: http:\/\/tools.ietf.org\/html\/rfc2324 Content-Length: 135\nAUTHOR\nA Kenneth Reitz Project.\nSEE ALSO\nhttp:\/\/python-requests.org","title":"httpbin(1): HTTP Client Testing Service","type":"article","url":"http:\/\/httpbin.org\/","xpath":"\/HTML[1]\/BODY[1]\/DIV[1]"}"""
     # Same payload with an additional "html" key.
     # NOTE(review): this literal spans three physical lines, so the string
     # contains two real newline characters -- confirm that matches the
     # service response and is not a copy/paste artifact.
     self.json_html_result = r"""{"tags":["Military service","Hypertext Transfer Protocol","Windows service"],"summary":"Testing an HTTP Library can become difficult sometimes.  PostBin.org is fantastic for testing POST requests, but not much else.  This exists to cover all kinds of HTTP scenarios.  Additional endpoints are being considered (e.g.  \/deflate). {\"user-agent\": \"curl\/7.19.7 (universal-apple-darwin10.0) libcurl\/7.19.7 OpenSSL\/0.9.8l zlib\/1.2.3\"}","text":"Freely hosted in HTTP, HTTPS & EU flavors.\n\n\nTesting an HTTP Library can become difficult sometimes. PostBin.org is fantastic for testing POST requests, but not much else. This exists to cover all kinds of HTTP scenarios. Additional endpoints are being considered (e.g. \/deflate).\nAll endpoint responses are JSON-encoded.\nEXAMPLES\n$ curl http:\/\/httpbin.org\/ip\n{\"origin\": \"24.127.96.129\"}\n$ curl http:\/\/httpbin.org\/user-agent\n{\"user-agent\": \"curl\/7.19.7 (universal-apple-darwin10.0) libcurl\/7.19.7 OpenSSL\/0.9.8l zlib\/1.2.3\"}\n$ curl http:\/\/httpbin.org\/get\n{ \"args\": {}, \"headers\": { \"Accept\": \"*\/*\", \"Connection\": \"close\", \"Content-Length\": \"\", \"Content-Type\": \"\", \"Host\": \"httpbin.org\", \"User-Agent\": \"curl\/7.19.7 (universal-apple-darwin10.0) libcurl\/7.19.7 OpenSSL\/0.9.8l zlib\/1.2.3\" }, \"origin\": \"24.127.96.129\", \"url\": \"http:\/\/httpbin.org\/get\" }\n$ curl -I http:\/\/httpbin.org\/status\/418\nHTTP\/1.1 418 I'M A TEAPOT Server: nginx\/0.7.67 Date: Mon, 13 Jun 2011 04:25:38 GMT Connection: close x-more-info: http:\/\/tools.ietf.org\/html\/rfc2324 Content-Length: 135\nAUTHOR\nA Kenneth Reitz Project.\nSEE ALSO\nhttp:\/\/python-requests.org","title":"httpbin(1): HTTP Client Testing Service","html":"<div><p>Freely hosted in <a href=\"http:\/\/httpbin.org\">HTTP<\/a>,\n<a href=\"https:\/\/httpbin.org\">HTTPS<\/a> &amp;\n<a href=\"http:\/\/eu.httpbin.org\">EU<\/a>\nflavors.<\/p><h2 id=\"ENDPOINTS\">ENDPOINTS<\/h2><h2 
id=\"DESCRIPTION\">DESCRIPTION<\/h2><p>Testing an HTTP Library can become difficult sometimes. PostBin.org is fantastic\nfor testing POST requests, but not much else. This exists to cover all kinds of HTTP\nscenarios. Additional endpoints are being considered (e.g. <code>\/deflate<\/code>).<\/p><p>All endpoint responses are JSON-encoded.<\/p><h2 id=\"EXAMPLES\">EXAMPLES<\/h2><h3 id=\"-curl-http-httpbin-org-ip\">$ curl http:\/\/httpbin.org\/ip<\/h3><pre>&lt;code&gt;{&quot;origin&quot;: &quot;24.127.96.129&quot;}\n&lt;\/code&gt;<\/pre><h3 id=\"-curl-http-httpbin-org-user-agent\">$ curl http:\/\/httpbin.org\/user-agent<\/h3><pre>&lt;code&gt;{&quot;user-agent&quot;: &quot;curl\/7.19.7 (universal-apple-darwin10.0) libcurl\/7.19.7 OpenSSL\/0.9.8l zlib\/1.2.3&quot;}\n&lt;\/code&gt;<\/pre><h3 id=\"-curl-http-httpbin-org-get\">$ curl http:\/\/httpbin.org\/get<\/h3><pre>&lt;code&gt;{\n   &quot;args&quot;: {},\n   &quot;headers&quot;: {\n      &quot;Accept&quot;: &quot;*\/*&quot;,\n      &quot;Connection&quot;: &quot;close&quot;,\n      &quot;Content-Length&quot;: &quot;&quot;,\n      &quot;Content-Type&quot;: &quot;&quot;,\n      &quot;Host&quot;: &quot;httpbin.org&quot;,\n      &quot;User-Agent&quot;: &quot;curl\/7.19.7 (universal-apple-darwin10.0) libcurl\/7.19.7 OpenSSL\/0.9.8l zlib\/1.2.3&quot;\n   },\n   &quot;origin&quot;: &quot;24.127.96.129&quot;,\n   &quot;url&quot;: &quot;http:\/\/httpbin.org\/get&quot;\n}\n&lt;\/code&gt;<\/pre><h3 id=\"-curl-I-http-httpbin-org-status-418\">$ curl -I http:\/\/httpbin.org\/status\/418<\/h3><pre>&lt;code&gt;HTTP\/1.1 418 I'M A TEAPOT\nServer: nginx\/0.7.67\nDate: Mon, 13 Jun 2011 04:25:38 GMT\nConnection: close\nx-more-info: http:\/\/tools.ietf.org\/html\/rfc2324\nContent-Length: 135\n&lt;\/code&gt;<\/pre><h2 id=\"AUTHOR\">AUTHOR<\/h2><p>A <a href=\"http:\/\/kennethreitz.com\/pages\/open-projects.html\">Kenneth Reitz<\/a>\nProject.<\/p><h2 id=\"SEE-ALSO\">SEE ALSO<\/h2><p><a data-bare-link=\"true\" 
href=\"http:\/\/python-requests.org\">http:\/\/python-requests.org<\/a><\/p><\/div>","type":"article","url":"http:\/\/httpbin.org\/","xpath":"\/HTML[1]\/BODY[1]\/DIV[1]"}"""
     # Object under test, built with the module-level API token.
     self.diffbot = DiffbotTransformer(DIFFBOT_TOKEN)
Exemplo n.º 3
0
def _decode_diffbot_response(json_result, url):
    """Parse a Diffbot response body into a dict.

    Logs an error mentioning *url* and returns None when *json_result* is not
    valid JSON or does not decode to a JSON object.
    """
    try:
        json_object = json.loads(json_result)
    except ValueError:
        # ValueError is the base class of JSONDecodeError in both the standard
        # json module and simplejson, so this handler works with either.
        logging.error("Unable to decode JSON for resource at : {}".format(url))
        return None
    if not isinstance(json_object, dict):
        # The callers do key lookups on the result; anything else is unusable.
        logging.error("Unable to decode JSON for resource at : {}".format(url))
        return None
    return json_object


def _plain_text_to_html(text):
    """Wrap plain text in <div> markup.

    Paragraphs are separated by a blank line in *text*; every line becomes its
    own <div> and each paragraph is followed by a <div><br /></div> spacer.
    """
    # NOTE(review): the text is inserted without HTML escaping -- confirm that
    # html2enml copes with '<' and '&' coming from fetched documents.
    parts = ["<div>"]
    for paragraph in text.split('\n\n'):
        parts.extend('<div>' + line + '</div>' for line in paragraph.split('\n'))
        parts.append("<div><br /></div>")
    parts.append("</div>")
    return ''.join(parts)


def _diffbot_autotags(diffbot_tags):
    """Turn Diffbot tags into one space-separated tag string.

    The result starts with 'autotag' to mark it as generated. Commas are
    removed (Evernote cannot handle tags with commas) and whitespace inside a
    tag is replaced by underscores.
    """
    return 'autotag ' + ' '.join(
        '_'.join(tag.replace(',', '').split()) for tag in diffbot_tags)


def _fill_html_content(item, json_object):
    """Set item.content from a decoded Diffbot response.

    Uses the 'html' field when present and falls back to the 'text' field.
    Returns False, after logging the reason, when the response reports an
    error or carries neither field.
    """
    if 'error' in json_object:
        logging.error("Failed to fetch resource at {}".format(item.url))
        logging.error(u"Reason: {}".format(json_object['error']))
        return False

    if json_object.get('statusCode') == 500:
        logging.error("Failed to fetch resource at {}".format(item.url))
        logging.error(u"Reason: {}".format(json_object.get('message')))
        return False

    if 'html' in json_object:
        item.content = html2enml(json_object['html'])
        return True

    # try plaintext
    if 'text' not in json_object:
        logging.error("Failed to fetch HTML document at all: {}".format(item.url))
        return False
    logging.warning("Failed to fetch HTML document for {}".format(item.url))
    logging.warning("Degrading to using text summary")
    item.content = html2enml(json_object['text'])
    return True


def _build_item(bookmark, resource, diffbot):
    """Create the item to push for *bookmark* from the fetched *resource*.

    Returns None, after logging the reason, when the bookmark has to be
    skipped (unknown content type or unusable Diffbot response).
    """
    if resource.is_PDF():
        item = PDFItem.from_pinboard_item(bookmark)
        # FIXME this could take very long. Need a way to address this problem.
        item.content = resource.fetch()
        return item

    if resource.is_image():
        item = ImageItem.from_pinboard_item(bookmark)
        item.content_type = resource.image_content_type()
        item.content = resource.fetch()
        return item

    is_html = resource.is_HTML()
    if not (is_html or resource.is_text()):
        logging.error("Unknown content-type of {}".format(resource.content_type))
        return None

    item_class = HTMLItem if is_html else TextItem
    item = item_class.from_pinboard_item(bookmark)

    # Diffbot is queried for plain text too: its response supplies the
    # automatic tags applied below.
    json_object = _decode_diffbot_response(
        diffbot.extract(item.url, html=True), bookmark.url)
    if json_object is None:
        return None

    if is_html:
        if not _fill_html_content(item, json_object):
            return None
    else:
        item.content = html2enml(_plain_text_to_html(resource.fetch()))

    # Check for default tags
    # FIXME seemingly random criteria for checking tags
    # Only untagged bookmarks, or ones whose sole tag is 'unread', are autotagged.
    if not item.tags or item.tags.lower() == 'unread':
        # Diffbot will not contain tags key even if explicitly told to return
        # tags if it does not find any
        if 'tags' in json_object:
            tags = _diffbot_autotags(json_object['tags'])
            item.tags = tags.encode('utf-8', 'xmlcharrefreplace')

    return item


def main():
    """Copy bookmarks added since the last run from Pinboard to Evernote.

    Bookmarks are fetched from the stored last-updated date and handled in
    reverse of the order returned by Pinboard. Each one is turned into an item
    according to its content type and pushed to Evernote. A bookmark that
    cannot be fetched, converted or pushed is logged and skipped. The stored
    date advances after each successful push, and the database is closed even
    if an unexpected exception escapes.
    """
    pinboard_db = PinboardDatabase()
    try:
        datestr = pinboard_db.last_updated

        pinboard = PinboardSource(PINBOARD_API_TOKEN)
        diffbot = DiffbotTransformer(DIFFBOT_TOKEN)
        evernote = EvernoteSink(EVERNOTE_DEVELOPER_TOKEN)

        logging.info("Fetching data from {}".format(datestr))

        # To debug a single bookmark, swap in pinboard.fetch_from_url(<url>).
        bookmarks = pinboard.fetch_from_date(datestr)

        for bookmark in reversed(bookmarks):
            logging.info("Handling : {}".format(bookmark.url))
            try:
                resource = URLFetcher(bookmark.url)
            except (requests.exceptions.ConnectionError,
                    requests.exceptions.TooManyRedirects) as e:
                logging.error("Failed to fetch resource at {}".format(bookmark.url))
                logging.error("Reason: {}".format(e))
                continue

            item = _build_item(bookmark, resource, diffbot)
            if item is None:
                continue

            try:
                evernote.push(item)
            except socket.error as e:
                logging.error("Socket error: {}".format(e))
                continue
            except EDAMUserException as e:
                logging.error("Unrecognized evernote type: {}".format(e))
                continue

            # Record progress only after a successful push.
            pinboard_db.last_updated = item.time
    finally:
        pinboard_db.close()