def getHTML(url):
    print '\nPROCESSING URL'
    parser = ParserClient(rk)
    p = parser.get_article_content(url)
    html = p.content['content']
    # print cleanHTML(html)
    return cleanHTML(html)
def generate_content(url, category):
    parser_client = ParserClient(token='7c8daeedd7726bf0c7d6b042098ee320ae336d87')
    parser_response = parser_client.get_article(str(url))
    article = parser_response.json()
    str_article_title = article['title']
    strarticle = article['content']
    final_article = re.sub('<.*?>', '', strarticle)       # strip HTML tags
    final_article2 = re.sub('&.*?;', '', final_article)   # strip HTML entities
    line = re.sub('["]', '', final_article2)              # strip double quotes
    final_article3 = line.encode('utf-8').strip()
    final_article3 = os.linesep.join([s for s in final_article3.splitlines() if s])
    final_article4 = re.sub(' +', ' ', final_article3)
    linet = re.sub('["]', '', str_article_title)
    final_article_title = linet.encode('utf-8').strip()
    intcategory = int(category)
    db = MySQLdb.connect("localhost", 'root', '', "inswipes")
    cursor = db.cursor()
    try:
        sql = 'INSERT INTO meta_content(article_content,link,main_category_id,article_title) VALUES("%s","%s","%d","%s")' % (final_article4, url, intcategory, final_article_title)
        cursor.execute(sql)
        db.commit()
        db.close()
    except:
        db.rollback()
        db.close()
    summarization()
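The string-formatted INSERT above is open to SQL injection and breaks whenever a value still contains a quote character. A minimal alternative sketch using MySQLdb's own parameter binding; the table and column names come from the snippet above, while the helper name and its arguments are illustrative:

import MySQLdb

def insert_article(db, content, link, category_id, title):
    # MySQLdb escapes each bound parameter itself; %s is the
    # placeholder for every column type, including integers.
    sql = ("INSERT INTO meta_content "
           "(article_content, link, main_category_id, article_title) "
           "VALUES (%s, %s, %s, %s)")
    cursor = db.cursor()
    try:
        cursor.execute(sql, (content, link, int(category_id), title))
        db.commit()
    except MySQLdb.Error:
        db.rollback()
        raise
    finally:
        cursor.close()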
def get_feed():
    logging.warning('get_feed: RSS check...')
    parsed_feed = feedparser.parse(config.RSS_FEED)
    parser_client = ParserClient(readability_api_key)
    feed_urls_cached = Feeds.query.all()
    db_url_list = [cached_feed.url for cached_feed in feed_urls_cached]
    logging.warning('get_feed: db urls count {}'.format(len(db_url_list)))
    for rss_url in parsed_feed['entries']:
        if rss_url['link'] not in db_url_list:
            logging.warning('get_feed: Added from rss: {}'.format(rss_url['link']))
            parser_response = parser_client.get_article_content(rss_url['link'])
            try:
                logging.warning('get_feed: Data len {}'.format(len(parser_response.content['content'])))
                save_to_db(rss_url['link'], parser_response.content['title'], parser_response.content['content'])
                add_feed = Feeds(url=rss_url['link'])
                db.session.add(add_feed)
                db.session.commit()
                write_action_log('rss', url=rss_url['link'], title=parser_response.content['title'])
            except KeyError, e:
                logging.warning('get_feed: ERR {}, no content'.format(e))
                db.session.rollback()
                add_feed = Feeds(url=rss_url['link'])
                db.session.add(add_feed)
                db.session.commit()
                write_action_log('rss', url=rss_url['link'], title="Err parse, no title")
def extracting_content(url):
    parser_client = ParserClient(token='#########################')
    parser_response = parser_client.get_article(str(url))
    article = parser_response.json()
    str_article_title = article['title']
    strarticle = article['content']
    print(str_article_title)
    print(strarticle)
def get_page_metadata(url):
    token = os.environ.get('READABILITY_PARSER_KEY', None)
    if not token:
        return {}
    try:
        parser_client = ParserClient(token=token)
        return parser_client.get_article(url).json()
    except Exception:
        logger.exception('Failed to fetch readability data for url %s', url)
        # Return the same empty dict as the missing-token path, so callers
        # never receive an implicit None.
        return {}
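A quick usage sketch for the helper above; the environment variable name comes from the snippet, while the token value and URL are illustrative:

import os

os.environ['READABILITY_PARSER_KEY'] = 'your-parser-token'  # illustrative value

metadata = get_page_metadata('https://en.wikipedia.org/wiki/Mark_Twain')
# Both the missing-token and failure paths return {}, so .get() is always safe.
title = metadata.get('title', '')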
def get_text(wiki_title="Jabari_Parker"):
    import re
    from readability import ParserClient
    parser_client = ParserClient(settings.PARSER_TOKEN)
    parser_response_text = parser_client.get_article_content(settings.WIKI_URL + wiki_title).content['content'].replace("\n", " ")
    # Strip inline images. (The original passed the JavaScript regex
    # "/<img[^>]*>/g" to str.replace, which treats it as a literal string
    # and never matches; re.sub does what was intended.)
    text = re.sub(r'<img[^>]*>', '', parser_response_text)
    ## Filter out the end of Wikipedia articles
    text = text.split('<span class="mw-headline" id="See_also"')[0]
    text = text.split('<span class="mw-headline" id="Notes"')[0]
    text = text.split('<span class="mw-headline" id="References"')[0]
    text = text.split('<span class="mw-headline" id="Notes_and_references"')[0]
    return text
def extract_raw_content(a):
    # data = Aylien().extract(a["resolved_url"])
    parser_client = ParserClient('0ae1d8bed72a91ed706dcf9f354a0db4b430cb47')
    parser_response = parser_client.get_article_content(a['resolved_url'])
    try:
        content = parser_response.content
        if 'error' in content:
            raise Exception
        return content
    except Exception as e:
        print parser_response
        print parser_response.content
        print e
        return False
class ReadabilityToEpub:
    def __init__(self, parser_token=None):
        if not parser_token:
            raise Exception(
                "Get a Readability parser token at: https://www.readability.com/developers/api"
            )
        self.parser_client = ParserClient(token=parser_token)

    def convert_url(self, url):
        parser_resp = self.parser_client.get_article(url).json()
        epub_book = epub.EpubBook()
        epub_book.set_title(parser_resp['title'])
        epub_book.add_author(parser_resp['author'])
        content_html = epub.EpubHtml(title=parser_resp['title'],
                                     file_name='content.xhtml',
                                     content="<h1>{}</h1>\n{}".format(parser_resp['title'],
                                                                      parser_resp['content']))
        epub_book.add_item(content_html)
        epub_book.add_item(epub.EpubNcx())
        epub_book.add_item(epub.EpubNav())
        # A spine determines the order in which content will be shown
        epub_book.spine = [content_html]
        epub.write_epub("{}.epub".format(slugify(parser_resp['title'])),
                        epub_book,
                        dict(plugins=[DownloadImagesPlugin()]))
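A minimal usage sketch for the converter above; the token value is illustrative, and the URL is borrowed from the API docs quoted later in this section:

converter = ReadabilityToEpub(parser_token='your-parser-token')  # illustrative token
# Writes <slugified-title>.epub to the current working directory.
converter.convert_url('http://blog.readability.com/2011/02/step-up-be-heard-readability-ideas/')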
def setUp(self):
    self.database = mongomock.Connection().db
    self.parser_client = ParserClient('readability secret parser key')
    response = Response(dict())
    response.content = dict(content='<p>article</p>')
    self.parser_client.get_article_content = MagicMock(return_value=response)
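This fixture plausibly pairs with the Mongo migration script near the end of this section. A hypothetical test body under that assumption, exercising the mocked get_article_content against the mongomock collection:

def test_do_it_stores_parsed_content(self):
    # Hypothetical test, assuming the do_it(database, parser_client)
    # helper defined in the migration script later in this section.
    self.database.bookmarks.insert({'url': 'http://example.com/article'})
    do_it(self.database, self.parser_client)
    bookmark = self.database.bookmarks.find_one({'url': 'http://example.com/article'})
    self.assertEqual(bookmark['content'], '<p>article</p>')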
def get(self):
    client = ParserClient(token='64c0f2ae58811bc3d09104e8d22abb3e3b328971')
    feeds = RSSinfo.query()
    for feed in feeds:
        if feed.get_full_article:
            items = RSS.query(ancestor=feed.key)
            for item in items:
                if item.content == 'no content':
                    parser_response = client.get_article(url=item.link)
                    sleep(1)
                    article = parser_response.json()
                    item.content = article['content']
                    item.put()
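Each item.put() above is a separate datastore write. When many items need content at once, batching the writes with ndb's put_multi cuts round trips; a sketch assuming the models above are ndb.Model subclasses (the function name is illustrative):

from time import sleep
from google.appengine.ext import ndb

def fill_missing_content(client, feed):
    updated = []
    for item in RSS.query(ancestor=feed.key):
        if item.content == 'no content':
            article = client.get_article(url=item.link).json()
            sleep(1)  # stay polite to the parser API, as above
            item.content = article['content']
            updated.append(item)
    if updated:
        ndb.put_multi(updated)  # one batched datastore write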
def post(self, request, *args, **kwargs):
    form = LinkForm(request.POST)
    if form.is_valid():
        link = form.save(commit=False)
        link.group = Group.objects.get(pk=self.kwargs['group_id'])
        # extract data from readability
        parser_client = ParserClient(token=settings.READABILITY_TOKEN)
        parser_response = parser_client.get_article(link.url)
        article = parser_response.json()
        link.title = article.get('title', '')
        link.content = article.get('content', '')
        link.description = article.get('excerpt', '')
        link.save()
        tags = extract_tags(link.title + ' ' + link.content)
        link.tags.add(*tags)
    url = reverse('groups:list_links', kwargs={'group_id': self.kwargs['group_id']})
    return redirect(url)
def generate_content(url, category):
    parser_client = ParserClient(token='7c8daeedd7726bf0c7d6b042098ee320ae336d87')
    parser_response = parser_client.get_article(str(url))
    article = parser_response.json()
    str_article_title = article['title']
    strarticle = article['content']
    final_article = re.sub('<.*?>', '', strarticle)
    final_article2 = re.sub('&.*?;', '', final_article)
    line = re.sub('["]', '', final_article2)
    final_article3 = line.encode('utf-8').strip()
    final_article3 = os.linesep.join([s for s in final_article3.splitlines() if s])
    final_article4 = re.sub(' +', ' ', final_article3)
    linet = re.sub('["]', '', str_article_title)
    final_article_title = linet.encode('utf-8').strip()
    print(url)
    print(final_article4)
    insertion(category, url, final_article4)
def get(self): """ *Get the readability parser client* **Return:** - ``parserClient`` -- the readability parser client """ self.log.info('starting the ``get`` method') from readability import ParserClient os.environ['READABILITY_PARSER_TOKEN'] = self.settings["readability"][ "parser api token"] parser_client = ParserClient() self.log.info('completed the ``get`` method') return parser_client
def _save_bookmark(bookmark_form):
    bookmark = {
        'title': bookmark_form.title.data,
        'url': bookmark_form.url.data,
        'description': bookmark_form.description.data,
        'referrer': bookmark_form.referrer.data,
        'tags': bookmark_form.tags.data,
        'published': datetime.datetime.utcnow(),
        'public': bookmark_form.public.data,
        'user': {
            '_id': ObjectId(current_user.get_id()),
            'nickname': current_user.nickname,
            'email': current_user.email
        }
    }
    if bookmark_form.archive.data:
        response = ParserClient(os.getenv('READABILITY_PARSER_KEY')).get_article_content(bookmark_form.url.data)
        if response.status == 200:
            bookmark['content'] = response.content['content']
    mongo.db.bookmarks.update({'url': bookmark_form.url.data, 'user._id': ObjectId(current_user.get_id())},
                              {'$set': bookmark}, upsert=True)
writer.write("<meta name = \"SOURCEURL\" content = \" " + sourceUrl + "\" />" + '\n') writer.write(parser_response.content['content'].encode("utf8")) # get event list from GDELT project data, one csv file for each day startPath = "http://data.gdeltproject.org/events/" outPutPath = "/home/ysz/news_graph/events" articleArchive = "/home/ysz/news_graph/article" getEvents(startPath, outPutPath) print "Downloading articles......" parser_client = ParserClient( 'f25f302cab7c00da41e4f5f2c5b17428f60c97d5' ) # Crawl Tool: https://www.readability.com/developers/api/parser startTime = latest_file(articleRoot) startTime = "20150430" endTime = "20150507" files = need_extract(outPutPath, ".export.CSV.zip", startTime, endTime) # print files: for fi in files: print "filename: ", fi filehandle = open(fi, 'rb') zf = zipfile.ZipFile(filehandle) base = os.path.basename(fi) pure_file_name = os.path.splitext(base)[0] try: data = StringIO.StringIO(zf.read(pure_file_name))
# -*- coding: utf-8 -*-
import nltk
import string
from nltk.collocations import *
from nltk.stem.wordnet import WordNetLemmatizer
from bs4 import BeautifulSoup
from readability import ParserClient

parser_client = ParserClient('0ae1d8bed72a91ed706dcf9f354a0db4b430cb47')
parser_response = parser_client.get_article_content('http://www.theatlantic.com/entertainment/archive/2014/02/russias-gold-medal-figure-skaters-celeb-relationship-status-pioneers/283804')
article = parser_response.content['content']
soup = BeautifulSoup(article, "lxml")
text = soup.get_text()
for k, v in parser_response.content.iteritems():
    if k in ['title', 'dek']:
        text = text + v
exclude = set(string.punctuation + '”' + '’')
text = ''.join(ch for ch in text if ch not in exclude and ch in string.printable).lower()
words = nltk.word_tokenize(text)
filtered_words = [w for w in words if w not in nltk.corpus.stopwords.words('english')]
for w in filtered_words:
    print w
def _get_article(self, url):
    parser_client = ParserClient(PARSER_TOKEN)
    return parser_client.get_article_content(url)
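A sketch of how a caller might consume the response returned above, following the response shape used elsewhere in this section (.status mirrors the HTTP status code, .content carries the parsed fields); the URL is illustrative:

# Assumed to run inside the same class as _get_article.
response = self._get_article('https://en.wikipedia.org/wiki/Mark_Twain')
if response.status == 200:
    title = response.content['title']
    html = response.content['content']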
def get_page_content():
    # relies on `url` and `readability_api_key` defined at module scope
    parser_client = ParserClient(readability_api_key)
    parser_response = parser_client.get_article_content(url)
    return parser_response
class ParserClientTest(TestCase):
    """Test case for the Parser Client"""

    def setUp(self):
        self.parser_client = ParserClient(PARSER_TOKEN)
        self.test_url = 'https://en.wikipedia.org/wiki/Mark_Twain'

    def test_generate_url(self):
        """Test the client's ability to generate urls to endpoints."""
        # test root resource
        expected_url = DEFAULT_PARSER_URL_TEMPLATE.format('')
        expected_url = '{0}?token={1}'.format(expected_url, PARSER_TOKEN)
        generated_url = self.parser_client._generate_url('')
        self.assertEqual(generated_url, expected_url)

        expected_url = DEFAULT_PARSER_URL_TEMPLATE.format('parser')
        params = {'url': 'http://www.beanis.biz/blog.html'}
        expected_url = '{0}?url=http%3A%2F%2Fwww.beanis.biz%2Fblog.html&token={1}'.format(
            expected_url, PARSER_TOKEN)
        generated_url = self.parser_client._generate_url(
            'parser', query_params=params)
        self.assertEqual(generated_url, expected_url)

    def test_get_root(self):
        """Test the client's ability to hit the root endpoint."""
        response = self.parser_client.get_root()
        expected_keys = set(['resources', ])
        self.assertEqual(set(response.content.keys()), expected_keys)

    def test_get_confidence(self):
        """Test the client's ability to hit the confidence endpoint."""
        # hit without an article_id or url. Should get an error.
        response = self.parser_client.get_confidence()
        self.assertEqual(response.status, 400)

        expected_keys = set(['url', 'confidence'])
        response = self.parser_client.get_confidence(url=self.test_url)
        self.assertEqual(response.status, 200)
        self.assertEqual(set(response.content.keys()), expected_keys)
        # confidence for wikipedia should be over .5
        self.assertTrue(response.content['confidence'] > .5)

    def test_get_article_status(self):
        """Test the client's ability to hit the parser endpoint with a
        HEAD request.
        """
        # hit without an article_id or url. Should get an error.
        response = self.parser_client.get_confidence()
        self.assertEqual(response.status, 400)

        response = self.parser_client.get_article_status(url=self.test_url)
        self.assertEqual(response.status, 200)
        self.assertTrue(response.get('x-article-status') is not None)
        self.assertTrue(response.get('x-article-id') is not None)

    def test_get_article_content(self):
        """Test the client's ability to hit the parser endpoint with a
        GET request.
        """
        # test with incorrect params
        response = self.parser_client.get_article_content()
        self.assertEqual(response.status, 400)

        response = self.parser_client.get_article_content(url=self.test_url)
        self.assertEqual(response.status, 200)
        some_expected_keys = set(['content', 'domain', 'author', 'word_count',
                                  'title', 'total_pages'])
        self.assertTrue(
            some_expected_keys.issubset(set(response.content.keys())))

    def test_post_article_content(self):
        """Test the client's ability to hit the parser endpoint with a
        POST request.
        """
        # I'm sorry...
content = """ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> <html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"> <title>Readability v1 Parser API</title><style type="text/css"> body { font-family: sans-serif; font: 0.8em/1.4 Arial, sans-serif; margin: 2em 6em; width: 65em; } pre { font-family: Courier, monospace; font-weight: 500; font-size: 0.8em; background-color: #eef; padding: 1em; } .methods { background-color: #e4e4e4; margin-top: .4em; padding: .6em; } .methods h4 { border-bottom: 1px solid #fff; padding: .1em 0; margin-bottom: .4em; color: #0b3c97; font-size: 1.1em; } .methods h6 { color: #666; text-transform: lowercase; margin: .6em 0 .3em; } .resource { margin-bottom: 2em; margin-top: .4em; } .resource h3 { margin-bottom: .4em; font-size: 1.4em; color: #ff5700; } h1 { font-size: 2.5em; } h2 { border-bottom: 1px solid black; margin-top: 1em; color: #666; margin-bottom: 0.5em; font-size: 2em; } h3 { font-size: 1.75em; margin: 0.6em 0; } h4 { color: #666; margin: 0; padding: 0.3em 0; border-bottom: 2px solid white; } h6 { font-size: 1.1em; color: #99a; margin: 0.5em 0em 0.25em 0em; } dd { margin-left: 1em; } tt { font-size: 1.2em; } table { margin-bottom: 0.5em; width: 100%; border-collapse: collapse; } th { text-align: left; font-weight: normal; color: black; border-bottom: 1px solid black; padding: 3px 6px; } td { padding: 3px 6px; vertical-align: top; background-color: f6f6ff; font-size: 0.85em; } td p { margin: 0px; } ul { padding-left: 1.75em; } p + ul, p + ol, p + dl { margin-top: 0em; } .optional { font-weight: normal; opacity: 0.75; } </style><link href="prettify/prettify.css" type="text/css" rel="stylesheet"></link><script type="text/javascript" src="prettify/prettify.js"></script></head><body onload="prettyPrint()"><h1>Readability v1 Parser API</h1> <section> <h2 id="authentication">Authentication</h2> <p> Requests to the Parser API are not signed like an OAuth request. The Parser token is simply passed as a POST or GET parameter depending on the request type. Be careful not to reveal this token, requests directly to the Parser API should not be made on the client device but rather proxied to keep the API token secure. </p> </section> <section> <h2 id="quick-start">Quick Start</h2> <p class="section-intro"> Here's how to pull an article's content from the Readability Parser API: </p> <h4>Request</h4> <pre>GET /api/content/v1/parser?url=http://blog.readability.com/2011/02/step-up-be-heard-readability-ideas/&token=1b830931777ac7c2ac954e9f0d67df437175e66e</pre> <h4>Response</h4> <pre> HTTP/1.0 200 OK { "content" <div class=\"article-text\">\n<p>I'm idling outside Diamante's, [snip] ...</p></div>", "domain": "www.gq.com", "author": "Rafi Kohan", "url": "http://www.gq.com/sports/profiles/201202/david-diamante-interview-cigar-lounge-brooklyn-new-jersey-nets?currentPage=all", "short_url": "http://rdd.me/g3jcb1sr", "title": "Blowing Smoke with Boxing's Big Voice", "excerpt": "I'm idling outside Diamante's, a cigar lounge in Fort Greene, waiting for David Diamante, and soon I smell him coming. It's late January but warm. A motorcycle growls down the Brooklyn side street,&hellip;", "direction": "ltr", "word_count": 2892, "total_pages": 1, "date_published": null, "dek": "Announcer <strong>David Diamante</strong>, the new voice of the New Jersey (soon Brooklyn) Nets, has been calling boxing matches for years. 
On the side, he owns a cigar lounge in the heart of Brooklyn. We talk with Diamante about his new gig and the fine art of cigars", "lead_image_url": "http://www.gq.com/images/entertainment/2012/02/david-diamante/diamante-628.jpg", "next_page_id": null, "rendered_pages": 1 } </pre> </section> <section> <h2 id="data-formats">Data Formats</h2> <p> All requests are, by default, provided as JSON. You may also pass "?format=xml" in the URL to convert this into XML data to be consumed. </p> </section> <h3>Resources, Representations & Errors</h3><ul><li><a href="#resources">Resources</a><ul><li><a href="#idp3728">https://readability.com/api/content/v1/</a></li><li><a href="#idp4080">https://readability.com/api/content/v1/parser</a></li><li><a href="#idp39744">https://readability.com/api/content/v1/confidence</a></li></ul></li><li><a href="#representations">Representations</a><ul><li><a href="#https://readability.com/api/content/v1#rootRepresentation">Example root representation. (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#articleRepresentation">Example article representation. (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#confidenceRepresentation">Example confidence representation. (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#confidenceRepresentationJsonp">Example confidence representation as jsonp. (application/json)</a></li></ul></li><li><a href="#faults">Errors</a><ul><li><a href="#https://readability.com/api/content/v1#error_400">400 Bad Request (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_401">401 Authorization Required (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_500">500 Internal Server Error (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_404">404 Not Found (application/json)</a></li></ul></li></ul><h2 id="resources">Resources</h2><div class="resource"><h3 id="idp3728">/</h3><h6>Methods</h6><div class="methods"><div class="method"><h4 id="idp5008">GET</h4> Retrieve the base API URI - information about subresources. <h6>request header parameters</h6><table><tr><th style="width: 25%">parameter</th><th style="width: 20%">value</th><th>description</th></tr><tr><td><p><strong>Authorization</strong></p></td><td><p><em><a href="" title=""></a></em><small> (required)</small></p></td><td></td></tr></table><p><em>available response representations:</em></p><ul><li><a href="#https://readability.com/api/content/v1#rootRepresentation">Example root representation. 
(application/json)</a></li></ul></div></div></div><div class="resource"><h3 id="idp4080">/parser?token<span class="optional">&url</span><span class="optional">&id</span><span class="optional">&max_pages</span></h3><h6>Methods</h6><div class="methods"><div class="method"><h4 id="idp36384">GET</h4> Parse an article <h6>request query parameters</h6><table><tr><th style="width: 25%">parameter</th><th style="width: 20%">value</th><th>description</th></tr><tr><td><p><strong>token</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em><small> (required)</small></p></td><td></td></tr><tr><td><p><strong>url</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em></p></td><td>The URL of an article to return the content for.</td></tr><tr><td><p><strong>id</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em></p></td><td>The ID of an article to return the content for.</td></tr><tr><td><p><strong>max_pages</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#integer">integer</a></em></p></td><td>The maximum number of pages to parse and combine. Default is 25.</td></tr></table><p><em>available response representations:</em></p><ul><li><a href="#https://readability.com/api/content/v1#articleRepresentation">Example article representation. (application/json)</a></li></ul><p><em>potential faults:</em></p><ul><li><a href="#https://readability.com/api/content/v1#error_400">400 Bad Request (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_401">401 Authorization Required (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_500">500 Internal Server Error (application/json)</a></li></ul></div><div class="method"><h4 id="idp63552">HEAD</h4> <p> Retrieve the Content Status of an article. This is useful if you want to save yourself from POSTing a large html document. You can do a HEAD request on the resource, and check for the status of the article in the X-Article-Status header. <strong>Additionally, if we've never seen the article before, we'll return a 404, which also means you should POST.</strong> </p> <h6>request query parameters</h6><table><tr><th style="width: 25%">parameter</th><th style="width: 20%">value</th><th>description</th></tr><tr><td><p><strong>token</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em><small> (required)</small></p></td><td></td></tr><tr><td><p><strong>url</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em></p></td><td>The URL of an article to check.</td></tr><tr><td><p><strong>id</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em></p></td><td>The ID of an article to check.</td></tr></table><h6>response header parameters</h6><table><tr><th style="width: 25%">parameter</th><th style="width: 20%">value</th><th>description</th></tr><tr><td><p><strong>X-Article-Id</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em></p></td><td> <p>The ID of the article within Readablity.</p> </td></tr><tr><td><p><strong>X-Article-Status</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em></p></td><td> <p>The status of the content in Readability. One of:</p> <dl> <dt>INVALID</dt> <dd>We were unable to parse this URL for some reason. 
<em>Recommendation: Fail</em></dd> <dt>UNRETRIEVED</dt> <dd>We know of this article, but have not yet retrieved its content, or the cache has expired. <em>Recommendation: POST content to us</em></dd> <dt>PROVIDED_BY_USER</dt> <dd>We have retrieved the content for this URL from at least one user. <em>Recommendation: POST content to us</em></dd> <dt>VALIDATED_BY_USERS</dt> <dd>We have retrieved the content for this URL from multiple users, and have validated it. <em>Recommendation: GET the content from us.</em></dd> <dt>FETCHED</dt> <dd>We fetched the content for this URL manually, and it has been cached. <em>Recommendation:GET the content from us.</em></dd> </dl> </td></tr></table><p><em>potential faults:</em></p><ul><li><a href="#https://readability.com/api/content/v1#error_400">400 Bad Request (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_401">401 Authorization Required (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_404">404 Not Found (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_500">500 Internal Server Error (application/json)</a></li></ul></div></div></div><div class="resource"><h3 id="idp39744">/confidence?url<span class="optional">&callback</span></h3><h6>Methods</h6><div class="methods"><div class="method"><h4 id="idp89296">GET</h4>Detect the confidence with which Readability could parse a given URL. Does not require a token.<h6>request query parameters</h6><table><tr><th style="width: 25%">parameter</th><th style="width: 20%">value</th><th>description</th></tr><tr><td><p><strong>url</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em><small> (required)</small></p></td><td>The URL of an article to return the confidence for.</td></tr><tr><td><p><strong>callback</strong></p></td><td><p><em><a href="http://www.w3.org/TR/xmlschema-2/#string">string</a></em></p></td><td>The jsonp callback function name.</td></tr></table><p><em>available response representations:</em></p><ul><li><a href="#https://readability.com/api/content/v1#confidenceRepresentation">Example confidence representation. (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#confidenceRepresentationJsonp">Example confidence representation as jsonp. (application/json)</a></li></ul><p><em>potential faults:</em></p><ul><li><a href="#https://readability.com/api/content/v1#error_400">400 Bad Request (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_500">500 Internal Server Error (application/json)</a></li></ul></div></div></div><h2 id="representations">Representations</h2><h3 id="https://readability.com/api/content/v1#rootRepresentation">Example root representation. (application/json)</h3> <pre xmlns="http://research.sun.com/wadl/2006/10" class="prettyprint"> { "resources": { "parser": { "description": "The Content Parser Resource", "href": "/api/content/v1/parser" } } } </pre> <h3 id="https://readability.com/api/content/v1#articleRepresentation">Example article representation. 
(application/json)</h3> <pre xmlns="http://research.sun.com/wadl/2006/10" class="prettyprint"> { "content" <div class=\"article-text\">\n<p>I'm idling outside Diamante's, [snip] ...</p></div>", "domain": "www.gq.com", "author": "Rafi Kohan", "url": "http://www.gq.com/sports/profiles/201202/david-diamante-interview-cigar-lounge-brooklyn-new-jersey-nets?currentPage=all", "short_url": "http://rdd.me/g3jcb1sr", "title": "Blowing Smoke with Boxing's Big Voice", "excerpt": "I'm idling outside Diamante's, a cigar lounge in Fort Greene, waiting for David Diamante, and soon I smell him coming. It's late January but warm. A motorcycle growls down the Brooklyn side street,&hellip;", "direction": "ltr", "word_count": 2892, "total_pages": 1, "date_published": null, "dek": "Announcer <strong>David Diamante</strong>, the new voice of the New Jersey (soon Brooklyn) Nets, has been calling boxing matches for years. On the side, he owns a cigar lounge in the heart of Brooklyn. We talk with Diamante about his new gig and the fine art of cigars", "lead_image_url": "http://www.gq.com/images/entertainment/2012/02/david-diamante/diamante-628.jpg", "next_page_id": null, "rendered_pages": 1 } </pre> <h3 id="https://readability.com/api/content/v1#confidenceRepresentation">Example confidence representation. (application/json)</h3> <pre xmlns="http://research.sun.com/wadl/2006/10" class="prettyprint"> { "url": "http://www.gq.com/article/12", "confidence": .7 } </pre> <h3 id="https://readability.com/api/content/v1#confidenceRepresentationJsonp">Example confidence representation as jsonp. (application/json)</h3> <pre xmlns="http://research.sun.com/wadl/2006/10" class="prettyprint"> callback({ "url": "http://www.gq.com/article/12", "confidence": .7 }); </pre> <h2 id="faults">Errors</h2><h3 id="https://readability.com/api/content/v1#error_400">400 Bad Request (application/json)</h3> The server could not understand your request. Verify that request parameters (and content, if any) are valid. <h3 id="https://readability.com/api/content/v1#error_401">401 Authorization Required (application/json)</h3> <p> Authentication failed or was not provided. Verify that you have sent valid ixDirectory credentials via HTTP Basic. </p> <p>A 'Www-Authenticate' challenge header will be sent with this type of error response.</p> <h3 id="https://readability.com/api/content/v1#error_500">500 Internal Server Error (application/json)</h3> An unknown error has occurred. <h3 id="https://readability.com/api/content/v1#error_404">404 Not Found (application/json)</h3> The resource that you requested does not exist. </body></html> """
        url = 'http://readability.com/developers/api/parser#https://readability.com/api/content/v1#test_suite'
        response = self.parser_client.post_article_content(content, url)
        self.assertEqual(response.status, 200)
        # should have gotten back content that is shorter than original
        self.assertTrue(len(content) > len(response.content['content']))
def setUp(self):
    self.parser_token = required_from_env('READABILITY_PARSER_TOKEN')
    self.parser_client = ParserClient(token=self.parser_token)
    self.test_url = 'https://en.wikipedia.org/wiki/Mark_Twain'
# -*- coding: utf-8 -*-
import os

from pymongo import MongoClient
from readability import ParserClient


def do_it(database, parser_client):
    for bookmark in list(database.bookmarks.find({}, {'_id': 1, 'url': 1})):
        response = parser_client.get_article_content(bookmark['url'])
        database.bookmarks.update({'_id': bookmark['_id']},
                                  {'$set': {'content': response.content['content']}},
                                  multi=True)


if __name__ == '__main__':
    client = MongoClient(os.environ['MONGOLAB_URI'])
    do_it(client.get_default_database(),
          ParserClient(os.getenv('READABILITY_PARSER_KEY')))
def main():
    if not os.path.isfile('credentials.config'):
        # if credentials file does not exist, start the first run function
        first_run()  # Authenticate and generate the credentials file.

    # command line switches function
    args = read_command_args()
    use_evernote = args.e
    debug_mode = args.debug
    delete_files = args.t if use_evernote is True else False
    path = args.p
    info_mode = args.i

    if debug_mode:
        # print("Warning - Debug mode active. Files will be downloaded, but not added to index")
        logger = create_logger(log_to_console=True)
        logger.setLevel(logging.DEBUG)
        logger.info('Warning - Debug mode active. Files will be downloaded, but not added to index')
    elif info_mode:
        warnings.warn("Suppressed Resource warning", ResourceWarning)  # suppresses all unclosed socket warnings.
        logger = create_logger(log_to_console=True)
    else:
        warnings.warn("Suppressed Resource warning", ResourceWarning)  # suppresses all unclosed socket warnings.
        logger = create_logger()

    logger.info("\n###########\nStarting SR\n###########")

    try:
        with open('credentials.config', 'r') as json_file:
            credentials = json.load(json_file)  # get various OAuth tokens
    except OSError:
        logger.error('Unable to open credentials file')
        raise SystemExit

    # Create the downloads folder on the specified path, or in the dir where file is stored.
    if path != "":
        path = path[0]
    else:
        path = os.getcwd()
    path += "/SRDownloads"
    if not os.path.exists(path):
        os.makedirs(path)

    # Authenticate with Reddit
    logger.info('Authenticating with Reddit')
    client_id = credentials['reddit']['client_id']
    client_secret = credentials['reddit']['client_secret']
    redirect_uri = credentials['reddit']['redirect_uri']
    refresh_token = credentials['reddit']['refresh_token']
    user_agent = "SavedRetriever 0.9 by /u/fuzzycut"
    try:
        r = praw.Reddit(user_agent=user_agent,
                        oauth_client_id=client_id,
                        oauth_client_secret=client_secret,
                        oauth_redirect_uri=redirect_uri)
        access_information = r.refresh_access_information(refresh_token)
        r.set_access_credentials(**access_information)
    except Exception as e:
        logger.error(e)
        raise SystemExit

    time_since_accesstoken = time.time()

    index = set()
    if os.path.isfile('index.txt'):  # checking for index file, which contains index of downloaded files.
        try:
            with open('index.txt', 'r') as ind:
                for line in ind:
                    index.add(line[:-1])  # -1 truncates the newline in the index file.
        except OSError:
            logger.error("Unable to open index file for reading")
            raise SystemExit

    if use_evernote is True:
        enclient = evernoteWrapper.Client(credentials['evernote']['dev_token'], 'Saved from Reddit')

    html_index_file = None
    if delete_files is False:  # only create index if we're going to use it.
        html_index_file = html_index.index(r.get_me().name, path)

    try:
        ind = open('index.txt', 'a')  # open index file for appending
    except OSError:
        logger.error("Unable to open index file for writing")
        raise SystemExit

    logger.info("Beginning to save files...")
    for i in r.get_me().get_saved(limit=None):
        if (time.time() - time_since_accesstoken) / 60 > 55:  # Refresh the access token before it runs out.
            logger.debug('Refreshing Reddit token')
            r.refresh_access_information(access_information['refresh_token'])
            time_since_accesstoken = time.time()

        name = i.name
        file_name = name  # to stop ide complaining.
        note = None
        evernote_tags = ('Reddit', 'SavedRetriever', '/r/' + i.subreddit.display_name)  # add config for this later

        # logger.info('Saving post - {}'.format(name))
        if name not in index:  # file has not been downloaded
            permalink = i.permalink
            author = i.author
            title = i.link_title if hasattr(i, 'link_title') else i.title

            # ========== #
            # IS COMMENT #
            # ========== #
            if hasattr(i, 'body_html'):
                logger.debug("{} is comment".format(name))
                body = i.body_html
                # html output
                body = subreddit_linker(body)
                output = html_output_string(permalink, author, body, title)
                if delete_files is False:
                    file_name = html_writer(path, name, output)
                # en api section
                if use_evernote is True:
                    enclient.new_note(title)
                    enclient.add_html(output)
                    enclient.add_tag(*evernote_tags)  # the * is very important. It unpacks the tags tuple properly
                    note = enclient.create_note()

            # ============ #
            # IS SELF-POST #
            # ============ #
            elif hasattr(i, 'is_self') and i.is_self is True:
                logger.debug('{} is self-post'.format(name))
                text = i.selftext_html if i.selftext_html is not None else ""
                # html output
                text = subreddit_linker(text)
                output = html_output_string(permalink, author, text, title)
                if delete_files is False:
                    file_name = html_writer(path, name, output)
                # en api section
                if use_evernote is True:
                    enclient.new_note(title)
                    enclient.add_tag(*evernote_tags)
                    enclient.add_html(output)
                    note = enclient.create_note()

            # ====================== #
            # IS DIRECT LINKED IMAGE #
            # ====================== #
            elif hasattr(i, 'url') and re.sub("([^A-z0-9])\w+", "", i.url.split('.')[-1]) in ['jpg', 'png', 'gif', 'gifv', 'pdf']:
                """
                Need to check file types and test pdf. How does this handle gfycat and webm?
                Can EN display that inline? The regex in the if is to strip out non-valid filetype chars.
                """
                logger.debug('{} is direct linked image'.format(name))
                url = i.url
                base_filename = "{}_image.{}".format(
                    name, re.sub("([^A-z0-9])\w+", "", url.split('.')[-1]))  # filename for image. regex same as above.
                filename = path + "/" + base_filename

                # image downloader section
                if os.path.exists(filename) and (os.path.getsize(filename) > 0):  # If image exists and is valid
                    image_downloaded = True
                    logger.info("Image already exists - {}".format(base_filename))
                else:
                    image_downloaded = image_saver(url, filename)
                    logger.info('Downloaded image - {}'.format(base_filename))

                if image_downloaded:
                    # write image as <img> or link to local pdf downloaded in html file
                    if filename.split('.')[-1] == 'pdf':
                        img = '<a href="{}">Click here for link to downloaded pdf</a>'.format(base_filename)
                    else:
                        img = '<br><a href="{0}"><img src="{0}"></a>'.format(base_filename)  # html for embedding in html file
                else:
                    img = "Image failed to download - It may be temporarily or permanently unavailable"

                # Evernote api section
                if use_evernote is True:
                    enclient.new_note(title)
                    enclient.add_tag(*evernote_tags)
                    enclient.add_html(html_output_string_image(permalink, author, "", title))  # should add body="" in the function
                    if image_downloaded:
                        enclient.add_resource(filename)
                    note = enclient.create_note()

                if delete_files is False:
                    file_name = html_writer(path, name, html_output_string_image(permalink, author, img, title))
                else:
                    os.remove(filename)

            # ============== #
            # IS IMGUR ALBUM #
            # ============== #
            elif hasattr(i, 'url') and 'imgur' in i.url:  # Add option to download images to folder.
                logger.debug('{} is Imgur album'.format(name))
                url = i.url
                body = "<h2>{}</h2>".format(title)

                # imgur api section
                client = ImgurClient(credentials['imgur']['client_id'], credentials['imgur']['client_secret'])
                pattern = '\/([A-z0-9]{5,7})'  # matches any 5-7 long word that comes after a forward slash (/).
                match = re.findall(pattern, url)
                gallery_id = match[-1].replace('/', '')  # removes any forward slashes for processing
                gallery = []
                filename = None
                try:
                    gallery = client.get_album_images(gallery_id)
                except imgurpython.helpers.error.ImgurClientError:  # if 'gallery' is actually just a lone image
                    try:
                        gallery = [client.get_image(gallery_id)]
                    except imgurpython.helpers.error.ImgurClientError as error:  # if gallery does not exist. Is this the best way to do this?
                        if debug_mode is True or error.status_code != 404:
                            print("**{} - {}**".format(error.status_code, error.error_message))

                # img_path = 'Downloads/{}'.format(gallery_id)
                img_path = path + "/" + gallery_id
                if not os.path.exists(img_path):
                    os.makedirs(img_path)

                for image in gallery:  # add if gallery > 10, then just add a link (would be too large for the note)
                    image_name = image.title if image.title is not None else ""
                    image_description = image.description if image.description is not None else ""
                    image_filetype = image.type.split('/')[1]
                    image_id = image.id
                    image_link = image.link
                    # sets up downloaded filename and html for embedding image
                    base_filename = "{}_image.{}".format(image_id, image_filetype)
                    img = '<p><h3>{0}</h3><a href="{1}/{2}"><img src="{1}/{2}"></a><br/>{3}</p>'.format(
                        image_name, gallery_id, base_filename, image_description)
                    filename = img_path + "/" + base_filename
                    if os.path.exists(filename) and (os.path.getsize(filename) > 0):  # only download if file doesn't already exist
                        logger.info('Image already exists - {}'.format(base_filename))
                    else:
                        image_saver(image_link, filename)
                        logger.info('Image downloaded - {}'.format(base_filename))
                    body += img

                # Evernote api section
                if use_evernote is True:
                    enclient.new_note(title)
                    enclient.add_tag(*evernote_tags)
                    if len(gallery) == 1 and filename is not None:
                        enclient.add_html(html_output_string_image(permalink, author, "", title))
                        enclient.add_resource(filename)
                    else:
                        enclient.add_html(html_output_string_image(
                            permalink, author,
                            'This album is too large to embed; please see <a href="{}">here</a> for the original link.'.format(url),
                            title))
                    note = enclient.create_note()

                if delete_files is False:
                    file_name = html_writer(path, name, html_output_string_image(permalink, author, body, title))
                else:
                    shutil.rmtree(img_path)

            # ========== #
            # IS ARTICLE #
            # ========== #
            elif hasattr(i, 'title') and i.is_self is False:
                # This section needs work. It is semi-complete. Ultimately, adding in the full article is the goal.
                logger.debug('{} is article/webpage'.format(name))
                url = i.url

                # readability api section
                os.environ["READABILITY_PARSER_TOKEN"] = credentials['readability']['parser_key']  # set the environment variable as the parser key
                logger.info('Initializing Readability Client')
                parse = ParserClient()  # readability api doesn't take the token directly
                parse_response = parse.get_article(url)
                article = parse_response.json()
                if 'content' not in article:  # if unable to parse document, manually set an error message
                    article['content'] = 'Unable to parse page - See <a href="{}">here</a> for the original link'.format(url)
                article = article['content']
                article = "<a href='{}'>{}</a><br/>{}<br/>".format(url, title, article)  # source of article

                # html output section.
                output = html_output_string(permalink, author, article, title)
                if delete_files is False:
                    file_name = html_writer(path, name, output)

                # Evernote section
                if use_evernote is True:
                    enclient.new_note(title)
                    enclient.add_tag(*evernote_tags)
                    output = html_output_string(permalink, author, article, title)
                    enclient.add_html(output)  # Add html file to note
                    # enclient.add_resource("Downloads/{}.html".format(name))
                    note = enclient.create_note()

            # end of checking for saved items
            failed_upload = False
            if use_evernote is True:
                if note is not None:
                    # print("Saved {:9} - GUID: {}".format(name, note.guid))
                    logger.info('Saved {:9} - GUID: {}'.format(name, note.guid))
                else:  # Upload failed
                    # print("Saved {:9} - Note failed to upload".format(name))
                    logger.info('Saved {:9} - Note failed to upload'.format(name))
                    failed_upload = True
            elif use_evernote is False:
                # print("Saved " + name)
                logger.info('Saved ' + name)

            if not debug_mode and not failed_upload:
                ind.write(name + "\n")
                ind.flush()  # this fixes python not writing the file if it terminates before .close() can be called
                if delete_files is False:
                    html_index_file.add_link(title, file_name, permalink)

    # end of for loop
    ind.close()
    logger.info("All items downloaded")
    if delete_files is False:
        html_index_file.save_and_close()
    else:
        # try remove downloads if -t is set, but don't force it if directory has things in it already.
        try:
            os.rmdir('Downloads')
        except OSError:
            logger.error("Unable to remove files")
class ParserClientTest(TestCase): """Test case for the Parser Client """ def setUp(self): self.parser_client = ParserClient(PARSER_TOKEN) self.test_url = 'https://en.wikipedia.org/wiki/Mark_Twain' def test_generate_url(self): """Test the clients ability to generate urls to endpoints. """ # test root resource expected_url = DEFAULT_PARSER_URL_TEMPLATE.format('') expected_url = '{0}?token={1}'.format(expected_url, PARSER_TOKEN) generated_url = self.parser_client._generate_url('') self.assertEqual(generated_url, expected_url) expected_url = DEFAULT_PARSER_URL_TEMPLATE.format('parser') params = {'url': 'http://www.beanis.biz/blog.html'} expected_url = '{0}?url=http%3A%2F%2Fwww.beanis.biz%2Fblog.html&token={1}'.format( expected_url, PARSER_TOKEN) generated_url = self.parser_client._generate_url('parser', query_params=params) self.assertEqual(generated_url, expected_url) def test_get_root(self): """Test the client's ability to hit the root endpoint. """ response = self.parser_client.get_root() expected_keys = set([ 'resources', ]) self.assertEqual(set(response.content.keys()), expected_keys) def test_get_confidence(self): """Test the client's ability to hit the confidence endpoint. """ # hit without an article_id or url. Should get an error. response = self.parser_client.get_confidence() self.assertEqual(response.status, 400) expected_keys = set(['url', 'confidence']) response = self.parser_client.get_confidence(url=self.test_url) self.assertEqual(response.status, 200) self.assertEqual(set(response.content.keys()), expected_keys) # confidence for wikipedia should be over .5 self.assertTrue(response.content['confidence'] > .5) def test_get_article_status(self): """Test the client's ability to hit the parser endpoint with a HEAD request. """ # hit without an article_id or url. Should get an error. response = self.parser_client.get_confidence() self.assertEqual(response.status, 400) response = self.parser_client.get_article_status(url=self.test_url) self.assertEqual(response.status, 200) self.assertTrue(response.get('x-article-status') is not None) self.assertTrue(response.get('x-article-id') is not None) def test_get_article_content(self): """Test the client's ability to hit the parser endpoint with a GET request. """ # test with incorrect params response = self.parser_client.get_article_content() self.assertEqual(response.status, 400) response = self.parser_client.get_article_content(url=self.test_url) self.assertEqual(response.status, 200) some_expected_keys = set([ 'content', 'domain', 'author', 'word_count', 'title', 'total_pages' ]) self.assertTrue( some_expected_keys.issubset(set(response.content.keys()))) def test_post_article_content(self): """Test the client's ability to hit the parser endpoint with a POST request. """ # I'm sorry... 
content = """ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> <html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"> <title>Readability v1 Parser API</title><style type="text/css"> body { font-family: sans-serif; font: 0.8em/1.4 Arial, sans-serif; margin: 2em 6em; width: 65em; } pre { font-family: Courier, monospace; font-weight: 500; font-size: 0.8em; background-color: #eef; padding: 1em; } .methods { background-color: #e4e4e4; margin-top: .4em; padding: .6em; } .methods h4 { border-bottom: 1px solid #fff; padding: .1em 0; margin-bottom: .4em; color: #0b3c97; font-size: 1.1em; } .methods h6 { color: #666; text-transform: lowercase; margin: .6em 0 .3em; } .resource { margin-bottom: 2em; margin-top: .4em; } .resource h3 { margin-bottom: .4em; font-size: 1.4em; color: #ff5700; } h1 { font-size: 2.5em; } h2 { border-bottom: 1px solid black; margin-top: 1em; color: #666; margin-bottom: 0.5em; font-size: 2em; } h3 { font-size: 1.75em; margin: 0.6em 0; } h4 { color: #666; margin: 0; padding: 0.3em 0; border-bottom: 2px solid white; } h6 { font-size: 1.1em; color: #99a; margin: 0.5em 0em 0.25em 0em; } dd { margin-left: 1em; } tt { font-size: 1.2em; } table { margin-bottom: 0.5em; width: 100%; border-collapse: collapse; } th { text-align: left; font-weight: normal; color: black; border-bottom: 1px solid black; padding: 3px 6px; } td { padding: 3px 6px; vertical-align: top; background-color: f6f6ff; font-size: 0.85em; } td p { margin: 0px; } ul { padding-left: 1.75em; } p + ul, p + ol, p + dl { margin-top: 0em; } .optional { font-weight: normal; opacity: 0.75; } </style><link href="prettify/prettify.css" type="text/css" rel="stylesheet"></link><script type="text/javascript" src="prettify/prettify.js"></script></head><body onload="prettyPrint()"><h1>Readability v1 Parser API</h1> <section> <h2 id="authentication">Authentication</h2> <p> Requests to the Parser API are not signed like an OAuth request. The Parser token is simply passed as a POST or GET parameter depending on the request type. Be careful not to reveal this token, requests directly to the Parser API should not be made on the client device but rather proxied to keep the API token secure. </p> </section> <section> <h2 id="quick-start">Quick Start</h2> <p class="section-intro"> Here's how to pull an article's content from the Readability Parser API: </p> <h4>Request</h4> <pre>GET /api/content/v1/parser?url=http://blog.readability.com/2011/02/step-up-be-heard-readability-ideas/&token=1b830931777ac7c2ac954e9f0d67df437175e66e</pre> <h4>Response</h4> <pre> HTTP/1.0 200 OK { "content" <div class=\"article-text\">\n<p>I'm idling outside Diamante's, [snip] ...</p></div>", "domain": "www.gq.com", "author": "Rafi Kohan", "url": "http://www.gq.com/sports/profiles/201202/david-diamante-interview-cigar-lounge-brooklyn-new-jersey-nets?currentPage=all", "short_url": "http://rdd.me/g3jcb1sr", "title": "Blowing Smoke with Boxing's Big Voice", "excerpt": "I'm idling outside Diamante's, a cigar lounge in Fort Greene, waiting for David Diamante, and soon I smell him coming. It's late January but warm. A motorcycle growls down the Brooklyn side street,&hellip;", "direction": "ltr", "word_count": 2892, "total_pages": 1, "date_published": null, "dek": "Announcer <strong>David Diamante</strong>, the new voice of the New Jersey (soon Brooklyn) Nets, has been calling boxing matches for years. 
On the side, he owns a cigar lounge in the heart of Brooklyn. We talk with Diamante about his new gig and the fine art of cigars", "lead_image_url": "http://www.gq.com/images/entertainment/2012/02/david-diamante/diamante-628.jpg", "next_page_id": null, "rendered_pages": 1 } </pre> </section> <section> <h2 id="data-formats">Data Formats</h2> <p> All requests are, by default, provided as JSON. You may also pass "?format=xml" in the URL to convert this into XML data to be consumed. </p> </section> <h3>Resources, Representations & Errors</h3><ul><li><a href="#resources">Resources</a><ul><li><a href="#idp3728">https://readability.com/api/content/v1/</a></li><li><a href="#idp4080">https://readability.com/api/content/v1/parser</a></li><li><a href="#idp39744">https://readability.com/api/content/v1/confidence</a></li></ul></li><li><a href="#representations">Representations</a><ul><li><a href="#https://readability.com/api/content/v1#rootRepresentation">Example root representation. (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#articleRepresentation">Example article representation. (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#confidenceRepresentation">Example confidence representation. (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#confidenceRepresentationJsonp">Example confidence representation as jsonp. (application/json)</a></li></ul></li><li><a href="#faults">Errors</a><ul><li><a href="#https://readability.com/api/content/v1#error_400">400 Bad Request (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_401">401 Authorization Required (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_500">500 Internal Server Error (application/json)</a></li><li><a href="#https://readability.com/api/content/v1#error_404">404 Not Found (application/json)</a></li></ul></li></ul><h2 id="resources">Resources</h2><div class="resource"><h3 id="idp3728">/</h3><h6>Methods</h6><div class="methods"><div class="method"><h4 id="idp5008">GET</h4> Retrieve the base API URI - information about subresources. <h6>request header parameters</h6><table><tr><th style="width: 25%">parameter</th><th style="width: 20%">value</th><th>description</th></tr><tr><td><p><strong>Authorization</strong></p></td><td><p><em><a href="" title=""></a></em><small> (required)</small></p></td><td></td></tr></table><p><em>available response representations:</em></p><ul><li><a href="#https://readability.com/api/content/v1#rootRepresentation">Example root representation. 
[... remainder of the Readability Parser API documentation page used as the
POST fixture: the GET /parser method (required token plus optional url, id
and max_pages, where max_pages defaults to 25); the HEAD /parser method,
which reports X-Article-Id and X-Article-Status response headers with a
status of INVALID, UNRETRIEVED, PROVIDED_BY_USER, VALIDATED_BY_USERS or
FETCHED; the GET /confidence method (required url, optional jsonp callback,
no token needed); example JSON article and confidence representations; and
the 400/401/404/500 error descriptions ...]
</body></html>
"""
        url = 'http://readability.com/developers/api/parser#https://readability.com/api/content/v1#test_suite'
        response = self.parser_client.post_article_content(content, url)
        self.assertEqual(response.status, 200)
        # should have gotten back content that is shorter than the original
        self.assertTrue(len(content) > len(response.content['content']))
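# A minimal sketch of the HEAD-then-GET-or-POST flow the documentation above
# recommends, written against the requests-style client the test suite below
# uses. fetch_html is a caller-supplied helper and get_or_post_article is an
# assumed name; neither comes from the source.
import os

from readability import ParserClient

def get_or_post_article(url, fetch_html):
    client = ParserClient(token=os.environ['READABILITY_PARSER_TOKEN'])
    status_response = client.get_article_status(url=url)
    if status_response.status_code == 404:
        # Readability has never seen this URL: POST the content ourselves.
        return client.post_article_content(fetch_html(url), url)
    status = status_response.headers.get('x-article-status')
    if status in ('UNRETRIEVED', 'PROVIDED_BY_USER'):
        # Recommendation per the docs: POST content to Readability.
        return client.post_article_content(fetch_html(url), url)
    if status in ('VALIDATED_BY_USERS', 'FETCHED'):
        # Recommendation per the docs: GET the parsed content.
        return client.get_article(url=url)
    # INVALID, or anything unexpected: fail.
    raise ValueError('Readability cannot parse %s (status %s)' % (url, status))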
#!/usr/bin/env python
from readability import ParserClient
import pystache
import sys
import re
from ftfy import fix_text
import codecs

# readability api key
parser = ParserClient('877f0069c46e0603a7d5868fab7d50731817dd9f')

# thanks to http://jamesmurty.com/2011/12/30/python-code-utf8-to-latin1/
def strip(html):
    # Replace "smart" and other single-quote like things
    html = re.sub(u'[\u02bc\u2018\u2019\u201a\u201b\u2039\u203a\u300c\u300d]', "'", html)
    # Replace "smart" and other double-quote like things
    html = re.sub(u'[\u00ab\u00bb\u201c\u201d\u201e\u201f\u300e\u300f]', '"', html)
    # Replace copyright symbol
    html = re.sub(u'[\u00a9\u24b8\u24d2]', '(c)', html)
    # Replace registered trademark symbol
    html = re.sub(u'[\u00ae\u24c7]', '(r)', html)
    # Replace sound recording copyright symbol
    html = re.sub(u'[\u2117\u24c5\u24df]', '(p)', html)
    # Replace service mark symbol
    html = re.sub(u'[\u2120]', '(sm)', html)
    # Replace trademark symbol
    html = re.sub(u'[\u2122]', '(tm)', html)
    # Replace en & em dashes
    html = re.sub(u'[\u2013]', u'–', html)
    html = re.sub(u'[\u2014]', u'—', html)
    # weird hyphen replace
    html = re.sub(u'[\xad]', u'\xad', html)
    return html
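# A minimal way to exercise strip(), assuming the old client's
# .content['content'] response shape used elsewhere in this file; the
# command-line usage is an assumption, not part of the original script.
if __name__ == '__main__':
    url = sys.argv[1]
    html = parser.get_article_content(url).content['content']
    # repair mojibake first, then downgrade the typographic characters
    print strip(fix_text(html))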
    def __init__(self, parser_token=None):
        if not parser_token:
            raise Exception(
                "Get a Readability parser token at: https://www.readability.com/developers/api"
            )
        self.parser_client = ParserClient(token=parser_token)
import unittest

from readability import ParserClient

# DEFAULT_PARSER_URL_TEMPLATE comes from the client module; required_from_env
# and load_test_content are test helpers (sketched after the class below).


class ParserClientTest(unittest.TestCase):
    """
    Test case for the Parser Client
    """
    def setUp(self):
        self.parser_token = required_from_env('READABILITY_PARSER_TOKEN')
        self.parser_client = ParserClient(token=self.parser_token)
        self.test_url = 'https://en.wikipedia.org/wiki/Mark_Twain'

    def test_generate_url(self):
        """
        Test the client's ability to generate urls to endpoints.
        """
        # Test root resource
        expected_url = DEFAULT_PARSER_URL_TEMPLATE.format('')
        expected_url = '{}?token={}'.format(expected_url, self.parser_token)
        generated_url = self.parser_client._generate_url('')
        self.assertEqual(generated_url, expected_url)

        # Test parser resource
        expected_url = '{base_url}?token={token}&url=http%3A%2F%2Fwww.google.biz%2Fblog.html'.format(
            base_url=DEFAULT_PARSER_URL_TEMPLATE.format('parser'),
            token=self.parser_token)
        params = {'url': 'http://www.google.biz/blog.html'}
        generated_url = self.parser_client._generate_url(
            'parser', query_params=params)
        self.assertEqual(generated_url, expected_url)

    def test_get_root(self):
        """
        Test the client's ability to hit the root endpoint.
        """
        response = self.parser_client.get_root()
        expected_keys = set(['resources', ])
        self.assertEqual(set(response.json().keys()), expected_keys)

    def test_get_confidence(self):
        """
        Test the client's ability to hit the confidence endpoint.
        """
        # hit without an article_id or url. Should get an error.
        response = self.parser_client.get_confidence()
        self.assertEqual(response.status_code, 400)

        expected_keys = set(['url', 'confidence'])
        response = self.parser_client.get_confidence(url=self.test_url)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(set(response.json().keys()), expected_keys)
        # confidence for wikipedia should be over .5
        self.assertTrue(response.json()['confidence'] >= .5)

    def test_get_article_status(self):
        """
        Test the client's ability to hit the parser endpoint with a HEAD
        """
        # hit without an article_id or url. Should get an error.
        response = self.parser_client.get_article_status()
        self.assertEqual(response.status_code, 400)

        response = self.parser_client.get_article_status(url=self.test_url)
        self.assertEqual(response.status_code, 200)
        self.assertTrue(response.headers.get('x-article-status') is not None)
        self.assertTrue(response.headers.get('x-article-id') is not None)

    def test_get_article(self):
        """
        Test the client's ability to hit the parser endpoint with a GET
        """
        # test with incorrect params
        response = self.parser_client.get_article()
        self.assertEqual(response.status_code, 400)

        response = self.parser_client.get_article(url=self.test_url)
        self.assertEqual(response.status_code, 200)

        some_expected_keys = set(['content', 'domain', 'author',
                                  'word_count', 'title', 'total_pages'])
        self.assertTrue(
            some_expected_keys.issubset(set(response.json().keys())))

    def test_post_article_content(self):
        """
        Test the client's ability to hit the parser endpoint with a POST
        request.
        """
        content = load_test_content('content/test_post_content.html')
        url = 'http://thisisaurlthatdoesntmatterbutmustbepassedanyway.com/article.html'
        response = self.parser_client.post_article_content(content, url)
        self.assertEqual(response.status_code, 200)
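# Minimal sketches of the helpers the suite assumes; the names come from the
# tests above, the bodies are assumptions:
import os

def required_from_env(key):
    # fail fast when the token backing the live tests is missing
    value = os.environ.get(key)
    if not value:
        raise EnvironmentError('%s must be set to run these tests' % key)
    return value

def load_test_content(path):
    # read an HTML fixture relative to the test directory
    with open(path) as f:
        return f.read()

if __name__ == '__main__':
    unittest.main()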
from flask import Flask, request, jsonify, make_response
from werkzeug.contrib.cache import MemcachedCache
#from readability.readability import Document
#import requests
from readability import ParserClient

application = Flask(__name__)
cache = MemcachedCache(['memcache:11211'])
# NOTE: ParserClient normally requires a parser token; none is passed here.
parser_client = ParserClient()


@application.route("/")
def hello():
    id = request.args.get('id', '')
    url = request.args.get('url', '')
    result = cache.get(id)
    if result is None:
        try:
            parser_response = parser_client.get_article(url)
            result = parser_response.json()
        except Exception as e:
            print e
            result = {
                "title": "Error",
                "content": "<h3>Unable to fetch article's content!</h3>",
            }
    result['summary'] = ('<h3>You are using the out-dated Hacker News app, '
                         'please update to latest version!</h3>')
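    # A plausible ending for the handler (an assumption; the source snippet
    # stops short): cache successful parses for an hour and answer with JSON.
    if result.get('title') != 'Error':
        cache.set(id, result, timeout=60 * 60)
    return make_response(jsonify(result))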
#
# for tag in soup.findAll('p'):
#     print tag
import os

from readability import ParserClient

os.environ['READABILITY_PARSER_TOKEN'] = 'c4e591e3f00ed1512c8194ab6616cf826d155294'
token = "c4e591e3f00ed1512c8194ab6616cf826d155294"

client = ParserClient(token=token)

parser_response = client.get_article('http://paulgraham.com/altair.html')
article = parser_response.json()
print(article['title'])
print(article['content'])

parser_response = client.get_article(
    "http://www.politico.com/story/2016/03/rubio-wins-dc-caucuses-220681")
article = parser_response.json()
print(article['title'])
print(article['content'])
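# The calls above assume every request succeeds. A guarded variant, reusing
# the requests-style response API (status_code, json()) exercised by the test
# suite in this file; safe_get_article is an assumed name:
def safe_get_article(url):
    response = client.get_article(url)
    if response.status_code != 200:
        return None
    return response.json()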
    except:
        print text
        return text


def getDomainName(domain):
    try:
        return domainToName[domain]
    except KeyError:
        return domain


results = []
with open('url.csv') as csvfile:
    urls = csvfile.read().split('\r\n')
    for url in urls:
        article = " "
        try:
            parser_client = ParserClient('dab74f9def9312c90473befef4181cf66bab7321')
            parser_response = parser_client.get_article_content(url)
            s = parser_response.content['content']
            x = parser_response.content['date_published']
            title = parser_response.content['title']
            author = parser_response.content['author']
            # crude scrub: drop 'href', strip images, turn <p> into <br>
            article = re.sub('href', '', s)
            article1 = re.sub('<img.*?>', '', article)
            article2 = re.sub('<p>', '<br>', article1)
            article3 = re.sub('</p>', '<br />', article2)
        except:
            print 'fail', url
        else:
            # only record URLs that actually parsed
            results += [(tounicode(url), tounicode(title), tounicode(author),
                         tounicode(article3), timeconvert(x))]

output = time.strftime('articles-%x.html').replace('/', '_')
with open(output, 'w') as outputfile:
    for result in results:
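        # A plausible body for the loop (an assumption; the source snippet
        # stops short): render one HTML block per parsed article.
        url, title, author, body, published = result
        block = u'<h2><a href="%s">%s</a></h2><p>%s | %s</p>%s<hr>' % (
            url, title, author, published, body)
        outputfile.write(block.encode('utf-8'))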
import mmh3
import csv
import os

from readability import ParserClient

PC = ParserClient(os.getenv('READABILITY_API_TOKEN'))

STOPWORDS = [
    'a', 'able', 'about', 'across', 'after', 'all', 'almost', 'also', 'am',
    'among', 'an', 'and', 'any', 'are', 'as', 'at', 'be', 'being', 'because',
    'been', 'but', 'by', 'can', 'cannot', 'could', 'dear', 'did', 'do',
    'does', 'either', 'else', 'ever', 'every', 'for', 'from', 'get', 'got',
    'had', 'has', 'have', 'he', 'her', 'hers', 'him', 'his', 'how',
    'however', 'i', 'if', 'in', 'into', 'is', 'it', 'its', 'just', 'least',
    'let', 'like', 'likely', 'may', 'me', 'might', 'most', 'must', 'my',
    'neither', 'no', 'nor', 'not', 'of', 'off', 'often', 'on', 'only', 'or',
    'other', 'our', 'own', 'rather', 'said', 'say', 'says', 'she', 'should',
    'since', 'so', 'some', 'than', 'that', 'the', 'their', 'them', 'then',
    'there', 'these', 'they', 'this', 'to', 'too', 'us', 'wants', 'was',
    'we', 'were', 'what', 'when', 'where', 'which', 'while', 'who', 'whom',
    'why', 'will', 'with', 'would', 'yet', 'you', 'your'
]


def add_post_dicts_to_csv(list_of_post_dictionaries):
    # writes all important data to CSV file to be used for calculating
    # feature vectors without hitting Readability API
    with open('post_dictionaries.csv', 'a') as csvfile:
        writer = csv.writer(csvfile, delimiter="\n")
        writer.writerow(list_of_post_dictionaries)


def call_readability():
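# call_readability() is truncated in the source and left as-is. Separately, a
# sketch of how the imports above plausibly combine: hash non-stopword tokens
# into a fixed-width feature vector with mmh3 (the hashing trick). The name
# and bucket count below are assumptions:
def hashed_features(text, n_buckets=1024):
    vector = [0] * n_buckets
    for token in text.lower().split():
        if token in STOPWORDS:
            continue
        # mmh3.hash() returns a signed 32-bit int; Python's % keeps it in range
        vector[mmh3.hash(token) % n_buckets] += 1
    return vector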