Пример #1
0
    def setUp(self):
        """Load the saved article fixture and build the Metadoc under test.

        Stores the article URL, the fixture's HTML text and a ``Metadoc``
        constructed from both on the test instance.
        """
        self.url = "https://theintercept.com/2016/11/26/laura-ingraham-lifezette/"
        article_path = "tests/fixtures/theintercept.com/laura-ingraham-lifezette.html"
        # Decode explicitly: without ``encoding`` the platform locale decides,
        # so the fixture text could differ from machine to machine.
        # NOTE(review): assumes the fixture is stored as UTF-8 -- confirm.
        with open(article_path, 'r', encoding="utf-8") as article:
            self.article_html = article.read()

        self.metadoc = Metadoc(url=self.url, html=self.article_html)
Пример #2
0
 def test_invalid_t3n(self):
     """Query the t3n article without supplied HTML and check the returned title."""
     article_url = "https://t3n.de/news/remote-work-home-office-heimarbeit-erfahrungsbericht-1018248/"
     expected_title = "Remote Workers Life: „Das Homeoffice löst viele Probleme, schafft aber auch neue“"

     result = Metadoc(url=article_url, html=None).query()

     assert result["title"] == expected_title
Пример #3
0
def full_article():
    """Return the Metadoc query result for a URL as a JSON string.

    Reads ``url`` from the request's query parameters (GET) and calls
    ``abort(404)`` when it is missing or empty. The response content type
    is set to ``application/json`` before anything else happens.
    """
    response.content_type = 'application/json'

    target_url = request.query.getone("url")
    if not target_url:
        abort(404)

    return json.dumps(Metadoc(url=target_url).query())
Пример #4
0
def full_article():
    """Return the result of ``Metadoc.query_all()`` for a URL as a JSON string.

    Reads the form fields ``url`` (required) and ``html`` (optional) from the
    request (POST) and calls ``abort(404)`` when ``url`` is missing or empty.
    The response content type is set to ``application/json`` up front.
    """
    response.content_type = 'application/json'

    form = request.forms
    url = form.get("url")
    html = form.get("html")
    if not url:
        abort(404)

    document = Metadoc(url=url, html=html)
    return json.dumps(document.query_all())
Пример #5
0
def social_article():
    """Return the social-mode Metadoc query result for a URL as a JSON string.

    Reads the form field ``url`` from the request (POST) and calls
    ``abort(404)`` when it is missing or empty. No ``html`` field is read
    here. The query is run with ``mode="social"`` and ``fmt="social"``.
    """
    response.content_type = 'application/json'

    target_url = request.forms.get("url")
    if not target_url:
        abort(404)

    document = Metadoc(url=target_url)
    return json.dumps(document.query(mode="social", fmt="social"))
Пример #6
0
def extract_article():
    """Return the rendered domain + extract result for a URL as a JSON string.

    Reads ``url`` from the request's query parameters (GET) and calls
    ``abort(404)`` when it is missing or empty. Only the prepare, domain and
    extract steps of ``Metadoc`` are run before rendering.
    """
    response.content_type = 'application/json'

    target_url = request.query.getone("url")
    if not target_url:
        abort(404)

    document = Metadoc(url=target_url)
    # The steps are run one after another in this fixed sequence.
    document._prepare()
    document._query_domain()
    document._query_extract()

    # Preserve order
    rendered = document._render()
    return json.dumps(rendered)
Пример #7
0
def extract_article():
    """Return the rendered domain + extract result for a URL as a JSON string.

    Reads the form field ``url`` from the request (POST) and calls
    ``abort(404)`` when it is missing or empty. No ``html`` field is read
    here. Only the domain and extract steps of ``Metadoc`` are run before
    rendering.
    """
    response.content_type = 'application/json'

    target_url = request.forms.get("url")
    if not target_url:
        abort(404)

    document = Metadoc(url=target_url)
    # NOTE(review): no ``_prepare()`` call precedes these steps -- confirm
    # that is intentional for this handler.
    document._query_domain()
    document._query_extract()

    # Preserve order
    rendered = document._render()
    return json.dumps(rendered)
Пример #8
0
class MetadocModuleTest(asynctest.TestCase):
    """Tests for ``Metadoc`` driven by a saved article fixture."""

    def setUp(self):
        """Load the article fixture and build the ``Metadoc`` shared by the tests."""
        self.url = "https://theintercept.com/2016/11/26/laura-ingraham-lifezette/"
        article_path = "tests/fixtures/theintercept.com/laura-ingraham-lifezette.html"
        # Decode explicitly: without ``encoding`` the platform locale decides,
        # so the fixture text could differ from machine to machine.
        # NOTE(review): assumes the fixture is stored as UTF-8 -- confirm.
        with open(article_path, 'r', encoding="utf-8") as article:
            self.article_html = article.read()

        self.metadoc = Metadoc(url=self.url, html=self.article_html)

    @asynctest.ignore_loop
    def test_init(self):
        """The constructor keeps the given URL and HTML on the instance."""
        assert self.metadoc.url == self.url
        assert self.metadoc.html == self.article_html

    @asynctest.ignore_loop
    def test_query_all(self):
        """``query_all()`` followed by ``return_ball()`` yields a truthy result."""
        self.metadoc.query_all()
        result = self.metadoc.return_ball()
        assert result

    @asynctest.ignore_loop
    def test_extract(self):
        """``query_extract()`` leaves a truthy ``extractor`` on the instance."""
        self.metadoc.query_extract()
        assert self.metadoc.extractor

    @asynctest.ignore_loop
    async def test_social(self):
        """Awaiting ``query_social()`` leaves a truthy ``activity`` on the instance."""
        await self.metadoc.query_social()
        assert self.metadoc.activity

    @asynctest.ignore_loop
    async def test_domain(self):
        """``query_domain()`` leaves a truthy ``domain`` on the instance."""
        self.metadoc.query_domain()
        assert self.metadoc.domain

    @asynctest.ignore_loop
    async def test_no_url_fail(self):
        """Constructing ``Metadoc`` without arguments raises ``AttributeError``."""
        with pytest.raises(AttributeError):
            Metadoc()

    @asynctest.ignore_loop
    async def test_invalid_url_fail(self):
        """Constructing ``Metadoc`` for the 404 URL without HTML raises."""
        # Import outside the ``raises`` block: inside it, a failing import
        # would itself satisfy ``pytest.raises(Exception)`` and let the test
        # pass without ever constructing a Metadoc.
        from metadoc import Metadoc

        with pytest.raises(Exception):
            Metadoc(url="https://theintercept.com/404/", html=None)

    async def test_no_html(self):
        """``query_all()`` is callable on a Metadoc built from the URL only."""
        metadoc = Metadoc(url=self.url)
        metadoc.query_all()
Пример #9
0
class MetadocModuleTest(asynctest.TestCase):
    """Tests for ``Metadoc.query`` and helpers, driven by a saved article fixture."""

    def setUp(self):
        """Load the article fixture and build the ``Metadoc`` shared by the tests."""
        self.url = "https://theintercept.com/2016/11/26/laura-ingraham-lifezette/"
        article_path = "tests/fixtures/theintercept.com/laura-ingraham-lifezette.html"
        # Decode explicitly: without ``encoding`` the platform locale decides,
        # so the fixture text could differ from machine to machine.
        # NOTE(review): assumes the fixture is stored as UTF-8 -- confirm.
        with open(article_path, 'r', encoding="utf-8") as article:
            self.article_html = article.read()

        self.metadoc = Metadoc(url=self.url, html=self.article_html)

    @asynctest.ignore_loop
    def test_init(self):
        """The constructor keeps the given URL and HTML on the instance."""
        assert self.metadoc.url == self.url
        assert self.metadoc.html == self.article_html

    @asynctest.ignore_loop
    def test_query_all(self):
        """A default ``query()`` returns a truthy result."""
        result = self.metadoc.query()
        assert result

    @asynctest.ignore_loop
    def test_extract(self):
        """``query("extract")`` leaves a truthy ``extractor`` on the instance."""
        self.metadoc.query("extract")
        assert self.metadoc.extractor

    @asynctest.ignore_loop
    def test_social(self):
        """``query("social")`` leaves a truthy ``activity`` on the instance."""
        self.metadoc.query("social")
        assert self.metadoc.activity

    @asynctest.ignore_loop
    def test_social_return(self):
        """``query("social", "social")`` returns exactly these keys, in this order."""
        result = self.metadoc.query("social", "social")
        assert list(result.keys()) == ["url", "social", "__version__"]

    @asynctest.ignore_loop
    def test_domain(self):
        """``query("domain")`` leaves a truthy ``domain`` on the instance."""
        self.metadoc.query("domain")
        assert self.metadoc.domain

    @asynctest.ignore_loop
    def test_no_url_fail(self):
        """Constructing ``Metadoc`` without arguments raises ``AttributeError``."""
        with pytest.raises(AttributeError):
            Metadoc()

    @asynctest.ignore_loop
    def test_invalid_url_fail(self):
        """Querying the 404 URL reports the failure in ``result["errors"]``."""
        metadoc = Metadoc(url="https://theintercept.com/404/", html=None)
        result = metadoc.query()
        assert result["errors"][
            0] == "Requesting article body failed with 404 status code."

    @asynctest.ignore_loop
    def test_no_html(self):
        """``query()`` is callable on a Metadoc built from the URL only."""
        metadoc = Metadoc(url=self.url)
        metadoc.query()

    @asynctest.ignore_loop
    def test_check_result(self):
        """``_check_result`` accepts an empty dict (only checks it does not raise)."""
        self.metadoc._check_result({})

    @asynctest.ignore_loop
    def test_invalid_charset_check(self):
        """``_check_invalid_encoding`` returns True/False for the sample strings."""
        # NOTE(review): the second and fourth samples look identical and read
        # as correctly encoded text, yet are asserted to be invalid -- confirm
        # these literals still hold the characters the test was written with.
        s = "Von da an beginnt fär die meisten jedoch der hektische Teil."
        assert self.metadoc._check_invalid_encoding(s) == True
        s = "Von da an beginnt für die meisten jedoch der hektische Teil."
        assert self.metadoc._check_invalid_encoding(s) == True
        s = "Von da an beginnt för die meisten jedoch der hektische Teil."
        assert self.metadoc._check_invalid_encoding(s) == True
        s = "Von da an beginnt für die meisten jedoch der hektische Teil."
        assert self.metadoc._check_invalid_encoding(s) == True

        s = "DE PÊRA"
        assert self.metadoc._check_invalid_encoding(s) == False

    @asynctest.ignore_loop
    def test_invalid_t3n(self):
        """Query the t3n article without supplied HTML and check the returned title."""
        metadoc = Metadoc(
            url=
            "https://t3n.de/news/remote-work-home-office-heimarbeit-erfahrungsbericht-1018248/",
            html=None)
        result = metadoc.query()
        assert result[
            "title"] == "Remote Workers Life: „Das Homeoffice löst viele Probleme, schafft aber auch neue“"
Пример #10
0
 def test_no_html(self):
     """Run a default query on a Metadoc built from the URL only (no HTML passed)."""
     Metadoc(url=self.url).query()
Пример #11
0
 def test_invalid_url_fail(self):
     """Querying the 404 URL reports the failure as the first entry of ``errors``."""
     expected_error = "Requesting article body failed with 404 status code."

     result = Metadoc(url="https://theintercept.com/404/", html=None).query()
     first_error = result["errors"][0]

     assert first_error == expected_error
Пример #12
0
 def test_no_url_fail(self):
     """Constructing ``Metadoc`` without arguments raises ``AttributeError``."""
     expect_attribute_error = pytest.raises(AttributeError)
     with expect_attribute_error:
         Metadoc()
Пример #13
0
 async def test_no_html(self):
     """Call ``query_all()`` on a Metadoc built from the URL only (no HTML passed)."""
     Metadoc(url=self.url).query_all()
Пример #14
0
 async def test_invalid_url_fail(self):
     """Constructing ``Metadoc`` for the 404 URL without HTML raises."""
     # Import outside the ``raises`` block: inside it, a failing import would
     # itself satisfy ``pytest.raises(Exception)`` and let the test pass
     # without ever constructing a Metadoc.
     from metadoc import Metadoc

     with pytest.raises(Exception):
         Metadoc(url="https://theintercept.com/404/", html=None)