Пример #1
0
def mercury_scraper(link, timeout=10):
    '''
    Returns the plain text of the article at `link`, taken from the
    'content' field of the json object returned by the MercuryParser api.

    The page is first requested directly to check that it is reachable;
    only then is it handed to MercuryParser, and the html tags are
    stripped from the returned content.

    Args:
        link: url of the article.
        timeout: seconds to wait for the reachability request before
            giving up (passed to requests.get).

    Returns:
        The article text; a fixed notice string when `link` ends with
        '.pdf' (case-insensitive); or None when the page does not answer
        with status code 200.

    Raises:
        Whatever requests.get raises (e.g. on connection failure or when
        `timeout` expires) is propagated to the caller.
    '''
    if link.lower().endswith('.pdf'):
        return "PDF file can't be accessed at this time."

    USER_AGENT = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
        }

    # Without a timeout an unresponsive host would block the caller forever.
    response = requests.get(link, headers=USER_AGENT, timeout=timeout)
    if response.status_code != 200:
        return None

    # NOTE(review): the api key is hard-coded; consider loading it from
    # configuration kept out of version control.
    parser = MercuryParser(api_key='ETkLjaGuTmB4FF0eQxWwPwUNjIeJwTDOJhKgigYA')

    article = parser.parse_article(link)
    text = article.json()['content']

    # MercuryParser still returns the html tags;
    # we use BeautifulSoup to strip those
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()
Пример #2
0
def test_parse_multiple_articles_with_invalid_key():
    """With a bogus API key, every requested URL maps to the deny message."""
    deny_message = (
        'User is not authorized to access this resource with an explicit deny'
    )
    urls = [
        'https://www.wired.com/2017/03/dont-blame-batteries-every-lithium-ion-explosion/',
        'https://www.wired.com/2017/03/siris-not-even-best-iphone-assistant-anymore/',
        'https://www.wired.com/2017/03/phishing-scams-fool-even-tech-nerds-heres-avoid/'
    ]

    client = MercuryParser('1234567890')
    response = client.parse_multiple_articles(*urls)

    # One identical error payload per URL, keyed by the URL itself.
    assert response == {url: {'Message': deny_message} for url in urls}
Пример #3
0
def test_parse_article():
    """Parsing a known Medium article yields its title, domain and a 200."""
    article_url = (
        "https://medium.com/swlh/alexa-play-some-music-isnt-the-only-time-amazon-is-listening-to-you-a556df19613f"  # noqa
    )
    response = MercuryParser().parse_article(article_url)

    payload = response.json()
    assert "Alexa, play some music" in payload["title"]
    assert payload["domain"] == "medium.com"
    assert response.status_code == 200
Пример #4
0
def test_parse_article():
    """Parsing a known Medium article with API_KEY yields title, domain, 200."""
    article_url = (
        'https://medium.com/swlh/alexa-play-some-music-isnt-the-only-time-amazon-is-listening-to-you-a556df19613f'  # noqa
    )
    response = MercuryParser(API_KEY).parse_article(article_url)

    payload = response.json()
    assert 'Alexa, play some music' in payload['title']
    assert payload['domain'] == 'medium.com'
    assert response.status_code == 200
Пример #5
0
def test_parse_multiple_articles():
    """Parsing three URLs in one call returns a mapping with three keys."""
    wired_articles = [
        "https://www.wired.com/2017/03/dont-blame-batteries-every-lithium-ion-explosion/",
        "https://www.wired.com/2017/03/siris-not-even-best-iphone-assistant-anymore/",
        "https://www.wired.com/2017/03/phishing-scams-fool-even-tech-nerds-heres-avoid/",
    ]
    client = MercuryParser()
    response = client.parse_multiple_articles(*wired_articles)

    parsed_keys = response.keys()
    assert len(parsed_keys) == 3
# NOTE(review): both API keys are hard-coded; consider loading them from the
# environment or from configuration kept out of version control.
API_GOOGLE_SHORTNER = 'AIzaSyDf6meD_lupaK7uUUha3s5P6LkCG6588m4'
MERCURY_WEB_PARSER = 'nGc0ya2J7z2aalFrGa8Gx3Q1o8grGFsn3cz58EJy'

# Words-per-minute rate that getTimeReadingString divides the word count by.
MY_READING_WORDS_PER_MINUTE = 235  # http://www.readingsoft.com/

# Two Telegram bot clients, one per token (the tokens are defined outside
# the visible part of this file).
bot = telegram.Bot(TOKEN_TELEGRAM)
bot2 = telegram.Bot(TOKEN_TELEGRAM_2)
shortener = Shortener('Google', api_key=API_GOOGLE_SHORTNER)
# NOTE(review): presumably the Telegram chat the bot talks to -- confirm.
chat_id = 31923577

# Leading and trailing parts of a Hacker News item URL; presumably the item
# id is inserted between them -- confirm against the code that uses them.
url = 'https://hacker-news.firebaseio.com/v0/item/'
url2 = '.json?print=pretty'
telegraph = Telegraph()
telegraph.createAccount("PythonTelegraphAPI")

parser = MercuryParser(api_key=MERCURY_WEB_PARSER)

# Start from the update_id of the first update returned by getUpdates(), or
# from None when the returned list is empty.
try:
    update_id = bot.getUpdates()[0].update_id
except IndexError:
    update_id = None


def getTimeReadingString(words):
    """Return a one-line reading-time estimate for a text of `words` words.

    The estimate divides `words` by MY_READING_WORDS_PER_MINUTE.

    Args:
        words: number of words in the text.

    Returns:
        A string starting with a newline, e.g. "\\n470 words. ~2 min, 0 sec".
        When the computed minutes are exactly zero the fixed form
        "\\n<words> words. ~1 min." is returned instead.
    """
    minutes = words / MY_READING_WORDS_PER_MINUTE
    if minutes == 0:
        return "\n" + str(words) + " words. ~1 min."

    # Round the total once and split it with divmod, so the seconds part can
    # never be displayed as 60 (rounding the fractional minute on its own
    # could produce "1 min, 60 sec").
    total_seconds = int(round(minutes * 60))
    whole_minutes, seconds = divmod(total_seconds, 60)

    timeReading = "\n" + str(words) + " words. ~" + str(
        whole_minutes) + " min, " + str(seconds) + " sec"
    # The computed string was previously built but never returned.
    return timeReading
Пример #7
0
import os
import argparse
import time

import html2text

from dotenv import load_dotenv, find_dotenv

if __name__ == "__main__":
    # Load variables from a .env file (if one is found) into the environment
    # before MERCURY_PARSER_KEY is read below.
    load_dotenv(find_dotenv())

    # HTML -> text converter that drops links and images.
    to_text = html2text.HTML2Text()
    to_text.ignore_links = True
    to_text.ignore_images = True

    cli = argparse.ArgumentParser()
    cli.add_argument('urls', metavar='N', nargs='+', help='The urls to parse.')
    options = cli.parse_args()

    mercury = MercuryParser(api_key=os.environ['MERCURY_PARSER_KEY'])

    for url in options.urls:
        print("Parsing", url, "...")
        article_html = mercury.parse_article(url).json()['content']
        content = to_text.handle(article_html)
        # One UTF-8 text file per url, named after the slugified url.
        out_name = slugify(url) + ".txt"
        with open(out_name, "wb") as f:
            f.write(content.encode('utf8'))
        # Pause one second between consecutive requests.
        time.sleep(1)
Пример #8
0
from entry import Entry
from bs4 import BeautifulSoup
from mercury_parser.client import MercuryParser
from config import MERCURY_API_KEY

# Module-level MercuryParser client, used by webpage_extractor.
parser = MercuryParser(api_key=MERCURY_API_KEY)


def pdf_extractor(res):
    """Return a 'web/pdf' Entry when the response is a PDF document.

    The media type is taken from the Content-Type header with any parameters
    (e.g. "; charset=...") removed and compared case-insensitively, so
    "Application/PDF; charset=binary" is recognised as well.

    Args:
        res: response object exposing `headers` (a mapping) and `url`.

    Returns:
        Entry(res.url, 'web/pdf') for a PDF response, otherwise None
        (including when the Content-Type header is missing).
    """
    content_type = res.headers.get('content-type', '')
    media_type = content_type.split(';', 1)[0].strip().lower()
    if media_type == 'application/pdf':
        return Entry(res.url, 'web/pdf')
    return None


def webpage_extractor(res):
    """Build a 'web' Entry for the page at `res.url` via MercuryParser.

    Title, summary and url are taken from the 'title', 'excerpt' and 'url'
    keys of the parsed article when present; otherwise title and summary
    stay None and the url falls back to `res.url`.
    """
    article = parser.parse_article(res.url)
    if not article:
        # Nothing usable came back: keep the original url, no metadata.
        return Entry(res.url, 'web', title=None, summary=None)

    data = article.json()
    title = data['title'] if 'title' in data else None
    summary = data['excerpt'] if 'excerpt' in data else None
    url = data['url'] if 'url' in data else res.url

    return Entry(url, 'web', title=title, summary=summary)


# Available extractor functions, with the PDF check listed before the generic
# web-page extractor.
# NOTE(review): presumably tried in this order by the caller -- confirm.
extractors = [pdf_extractor, webpage_extractor]
Пример #9
0
import json
from urllib.parse import unquote
from flask import Flask, request, render_template, url_for, redirect
app = Flask(__name__)
# Case-insensitive pattern matching a whole string that is an absolute
# http(s)/ftp(s) URL.
# NOTE(review): `re` is not imported in the visible part of this file --
# confirm it is imported elsewhere, otherwise this raises NameError.
regex = re.compile(
    r'^(?:http|ftp)s?://'  # scheme: http://, https://, ftp:// or ftps://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)$',  # optional trailing slash, or a path/query with no whitespace
    re.IGNORECASE)

from mercury_parser.client import MercuryParser

# Module-level MercuryParser client (constructed without arguments), used by
# the /parse route.
parser = MercuryParser()


@app.route('/')
def index():
    """Serve the static index.html file for the site root."""
    landing_page = 'index.html'
    return app.send_static_file(landing_page)


@app.route('/parse', methods=['GET'])
def parse():
    url = unquote(request.args.get('url'))
    style = request.args.get('style')
    result = parser.parse_article(url).json()
    if style == 'dark':
        css = url_for('static', filename='dark.css')
    else: