Пример #1
0
 def test_html_parse(self):
     text = open("test_documents/foxnews.html", "r").read()
     scm = Schemato(text, url="http://foxnews.com", loglevel="ERROR")
     results = scm.validate()
     for res in results:
         if res.classname == "ParselyPageValidator":
             self.assertTrue("ttle - invalid parsely-page field" in
                             [a.string for a in res.errors])
             self.assertTrue(len(res.errors) == 1)
Пример #2
0
    def test_schema_errors(self):
        scm = Schemato("test_documents/schema_errors.html")
        results = scm.validate()

        expected = {
            'classname':
            'SchemaOrgSchemaDef',
            'errors': [{
                'line': '<a  itemprop="copyrightNotice"',
                'num': 63,
                'string': 'copyrightNotice - invalid member of NewsArticle',
                'level': 'Error'
            }, {
                'line':
                '<meta itemprop="tickerSymbol" content="NYSE NYT"/>',
                'num': 74,
                'string': 'tickerSymbol - invalid member of Organization',
                'level': 'Error'
            }, {
                'line': 'itemprop="createdBy"',
                'num': 47,
                'string': 'createdBy - invalid member of NewsArticle',
                'level': 'Error'
            }, {
                'line': '<a  itemprop="usageTerms"',
                'num': 78,
                'string': 'usageTerms - invalid member of NewsArticle',
                'level': 'Error'
            }, {
                'line': 'itemtype="http://schema.org/UserComment"',
                'num': 111,
                'string': 'UserComment - invalid class',
                'level': 'Error'
            }, {
                'line': 'itemtype="http://schema.org/UserComment"',
                'num': 111,
                'string': 'UserComment - invalid class',
                'level': 'Error'
            }, {
                'line': 'itemtype="http://schema.org/UserComment"',
                'num': 111,
                'string': 'UserComment - invalid class',
                'level': 'Error'
            }],
            'namespace':
            'http://schema.org/',
            'warnings': []
        }

        for res in results:
            if res.classname == 'SchemaOrgSchemaDef':
                self.assertTrue(len(res.errors) > 0)
                for err in res.errors:
                    self.assertTrue(err.string in
                                    [a['string'] for a in expected['errors']])
Пример #3
0
from distillery import ParselyDistiller, NewsDistiller
from schemato import Schemato
from pprint import pprint
print "Loading NY Daily News..."
nydailynews = Schemato(
    "http://www.nydailynews.com/news/politics/obama-fights-back-2nd-debate-romney-article-1.1185271"
)
print "Done."
print "Loading Mashable..."
mashable = Schemato("http://mashable.com/2012/10/17/iphone-5-supply-problems/")
print "Done."


def demo(desc, class_, site):
    print desc
    print "=" * len(desc)
    d = class_(site)
    d.distill()
    pprint({"distilled": d.distilled, "sources": d.sources})


demo("Parse.ly strategy on Mashable", ParselyDistiller, mashable)
demo("News strategy on Mashable", NewsDistiller, mashable)
demo("Parse.ly strategy on NY Daily News", ParselyDistiller, nydailynews)
demo("News strategy on NY Daily News", NewsDistiller, nydailynews)
Пример #4
0
 def test_schema_no_errors(self):
     scm = Schemato("test_documents/schema.html")
     results = scm.validate()
     for res in results:
         self.assertTrue(len(res) == 0)
Пример #5
0
 def assert_no_validation_errors(self, doc):
     sc = Schemato(doc)
     res = sc.validate()
     assert all([len(a.warnings) == 0 and len(a.errors) == 0 for a in res])