def mercury_scraper(link, api_key='ETkLjaGuTmB4FF0eQxWwPwUNjIeJwTDOJhKgigYA'):
    """Return the plain-text 'content' field of the article parsed by the
    Mercury Parser API.

    Parameters
    ----------
    link : str
        URL of the article to fetch and parse.
    api_key : str, optional
        Mercury Parser API key. Defaults to the key the original code
        hard-coded inline, so existing callers are unaffected; new callers
        should pass their own key instead of shipping this one.

    Returns
    -------
    str or None
        The article text with HTML tags stripped, an explanatory message for
        PDF links, or None when the URL cannot be fetched (HTTP != 200).
    """
    USER_AGENT = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
    }

    # Mercury cannot extract text from PDFs; bail out early.
    if link.lower().endswith('.pdf'):
        return "PDF file can't be accessed at this time."

    # Probe the URL first so we do not waste a Mercury API call on dead links.
    access_granted = requests.get(link, headers=USER_AGENT).status_code == 200
    if not access_granted:
        # BUG FIX: the original fell off the end here and returned None
        # implicitly; make that contract explicit for callers.
        return None

    parser = MercuryParser(api_key=api_key)
    article = parser.parse_article(link)
    text = article.json()['content']
    # Mercury still returns HTML tags; strip them with BeautifulSoup.
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()
def test_parse_multiple_articles_with_invalid_key():
    """Every URL parsed with a bogus key yields the same denial payload."""
    urls = [
        'https://www.wired.com/2017/03/dont-blame-batteries-every-lithium-ion-explosion/',
        'https://www.wired.com/2017/03/siris-not-even-best-iphone-assistant-anymore/',
        'https://www.wired.com/2017/03/phishing-scams-fool-even-tech-nerds-heres-avoid/'
    ]
    response = MercuryParser('1234567890').parse_multiple_articles(*urls)

    denial = {
        'Message': 'User is not authorized to access this resource with an explicit deny'
    }
    # The API rejects each URL identically, so build the expectation once.
    expected = {url: denial for url in urls}
    assert response == expected
def test_parse_article():
    """A keyless MercuryParser still parses a Medium article successfully."""
    article_url = "https://medium.com/swlh/alexa-play-some-music-isnt-the-only-time-amazon-is-listening-to-you-a556df19613f"  # noqa
    response = MercuryParser().parse_article(article_url)
    payload = response.json()
    assert "Alexa, play some music" in payload["title"]
    assert payload["domain"] == "medium.com"
    assert response.status_code == 200
def test_parse_article():
    """Parsing a Medium article with the configured API key returns metadata."""
    article_url = 'https://medium.com/swlh/alexa-play-some-music-isnt-the-only-time-amazon-is-listening-to-you-a556df19613f'  # noqa
    response = MercuryParser(API_KEY).parse_article(article_url)
    payload = response.json()
    assert 'Alexa, play some music' in payload['title']
    assert payload['domain'] == 'medium.com'
    assert response.status_code == 200
def test_parse_multiple_articles():
    """parse_multiple_articles returns one result entry per requested URL."""
    urls = (
        "https://www.wired.com/2017/03/dont-blame-batteries-every-lithium-ion-explosion/",
        "https://www.wired.com/2017/03/siris-not-even-best-iphone-assistant-anymore/",
        "https://www.wired.com/2017/03/phishing-scams-fool-even-tech-nerds-heres-avoid/",
    )
    response = MercuryParser().parse_multiple_articles(*urls)
    # One mapping entry per input URL.
    assert len(response) == 3
# --- Keys and constants ---------------------------------------------------
API_GOOGLE_SHORTNER = 'AIzaSyDf6meD_lupaK7uUUha3s5P6LkCG6588m4'
MERCURY_WEB_PARSER = 'nGc0ya2J7z2aalFrGa8Gx3Q1o8grGFsn3cz58EJy'
MY_READING_WORDS_PER_MINUTE = 235  # http://www.readingsoft.com/

# --- Service clients ------------------------------------------------------
bot = telegram.Bot(TOKEN_TELEGRAM)
bot2 = telegram.Bot(TOKEN_TELEGRAM_2)
shortener = Shortener('Google', api_key=API_GOOGLE_SHORTNER)
chat_id = 31923577
url = 'https://hacker-news.firebaseio.com/v0/item/'
url2 = '.json?print=pretty'
telegraph = Telegraph()
telegraph.createAccount("PythonTelegraphAPI")
parser = MercuryParser(api_key=MERCURY_WEB_PARSER)

try:
    update_id = bot.getUpdates()[0].update_id
except IndexError:
    # No pending Telegram updates yet.
    update_id = None


def getTimeReadingString(words):
    """Return a human-readable reading-time estimate for *words* words.

    BUG FIX: the original computed the estimate string but never returned it
    (implicit None), and its ``minutes == 0`` guard could only fire for
    exactly 0 words under Python 3 true division. Any article shorter than
    one minute now gets the "~1 min." message as intended.
    """
    minutes = words / MY_READING_WORDS_PER_MINUTE
    if minutes < 1:
        return "\n" + str(words) + " words. ~1 min."
    # Split the fractional minute into whole minutes and remaining seconds.
    seconds = round((minutes - int(minutes)) * 60)
    return ("\n" + str(words) + " words. ~" + str(int(minutes)) +
            " min, " + str(seconds) + " sec")
import os
import argparse
import time
import html2text
from dotenv import load_dotenv, find_dotenv

if __name__ == "__main__":
    # Load MERCURY_PARSER_KEY (and friends) from the nearest .env file.
    load_dotenv(find_dotenv())

    markdown = html2text.HTML2Text()
    markdown.ignore_links = True
    markdown.ignore_images = True

    cli = argparse.ArgumentParser()
    cli.add_argument('urls', help='The urls to parse.', metavar='N', nargs='+')
    options = cli.parse_args()

    mercury = MercuryParser(api_key=os.environ['MERCURY_PARSER_KEY'])
    for url in options.urls:
        print("Parsing", url, "...")
        article_html = mercury.parse_article(url).json()['content']
        content = markdown.handle(article_html)
        # Save each article as UTF-8 text, named after its slugified URL.
        with open(slugify(url) + ".txt", "wb") as outfile:
            outfile.write(content.encode('utf8'))
        time.sleep(1)  # throttle so we don't hammer the API
from entry import Entry
from bs4 import BeautifulSoup
from mercury_parser.client import MercuryParser
from config import MERCURY_API_KEY

# Shared Mercury client used by webpage_extractor below.
parser = MercuryParser(api_key=MERCURY_API_KEY)


def pdf_extractor(res):
    """Return a 'web/pdf' Entry when *res* is a PDF response, else None.

    Uses headers.get() so a response that omits the Content-Type header no
    longer raises KeyError (the original indexed the header directly).
    """
    if res.headers.get('content-type') == 'application/pdf':
        return Entry(res.url, 'web/pdf')


def webpage_extractor(res):
    """Build a 'web' Entry from the Mercury-parsed article at res.url.

    Title, summary, and URL fall back to None/None/res.url when Mercury does
    not supply the corresponding fields.
    """
    title, summary, url = None, None, res.url
    article = parser.parse_article(res.url)
    if article:
        data = article.json()
        # dict.get() with the current value as default replaces the original
        # 'key in dict' + index idiom; behavior is identical.
        title = data.get('title', title)
        summary = data.get('excerpt', summary)
        url = data.get('url', url)
    return Entry(url, 'web', title=title, summary=summary)


extractors = [pdf_extractor, webpage_extractor]
import json from urllib.parse import unquote from flask import Flask, request, render_template, url_for, redirect app = Flask(__name__) regex = re.compile( r'^(?:http|ftp)s?://' # http:// or https:// r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' #domain... r'localhost|' #localhost... r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip r'(?::\d+)?' # optional port r'(?:/?|[/?]\S+)$', re.IGNORECASE) from mercury_parser.client import MercuryParser parser = MercuryParser() @app.route('/') def index(): return app.send_static_file('index.html') @app.route('/parse', methods=['GET']) def parse(): url = unquote(request.args.get('url')) style = request.args.get('style') result = parser.parse_article(url).json() if style == 'dark': css = url_for('static', filename='dark.css') else: