def test_html_parse(self):
    """Validate the Fox News fixture and check the parsely-page error.

    Reads ``test_documents/foxnews.html``, passes its text to ``Schemato``
    together with the page URL, and runs validation. For every result whose
    ``classname`` is ``"ParselyPageValidator"`` the test expects exactly one
    error, whose ``string`` is ``"ttle - invalid parsely-page field"``.

    Results from other validators are ignored; if no result has that
    classname, no assertion is executed.
    """
    # Use a context manager so the fixture handle is closed deterministically
    # instead of being left for the garbage collector.
    with open("test_documents/foxnews.html", "r") as fixture:
        text = fixture.read()

    scm = Schemato(text, url="http://foxnews.com", loglevel="ERROR")
    results = scm.validate()

    for res in results:
        if res.classname == "ParselyPageValidator":
            messages = [a.string for a in res.errors]
            # assertIn / assertEqual report the offending values on failure,
            # unlike assertTrue on a pre-evaluated boolean.
            self.assertIn("ttle - invalid parsely-page field", messages)
            self.assertEqual(len(res.errors), 1)
def test_schema_errors(self):
    """Check that errors reported for schema_errors.html are all expected.

    Runs ``Schemato`` validation on ``test_documents/schema_errors.html``.
    For every result whose ``classname`` is ``'SchemaOrgSchemaDef'`` the test
    requires at least one error and requires each error's ``string`` to be
    one of the messages listed in ``expected['errors']``.

    Only the ``'string'`` field of each expected entry is compared; the
    ``'line'``, ``'num'`` and ``'level'`` fields are kept as a record of the
    expected report but are not asserted. Results from other validators are
    ignored.
    """
    scm = Schemato("test_documents/schema_errors.html")
    results = scm.validate()

    expected = {
        'classname': 'SchemaOrgSchemaDef',
        'errors': [
            {
                'line': '<a itemprop="copyrightNotice"',
                'num': 63,
                'string': 'copyrightNotice - invalid member of NewsArticle',
                'level': 'Error'
            },
            {
                'line': '<meta itemprop="tickerSymbol" content="NYSE NYT"/>',
                'num': 74,
                'string': 'tickerSymbol - invalid member of Organization',
                'level': 'Error'
            },
            {
                'line': 'itemprop="createdBy"',
                'num': 47,
                'string': 'createdBy - invalid member of NewsArticle',
                'level': 'Error'
            },
            {
                'line': '<a itemprop="usageTerms"',
                'num': 78,
                'string': 'usageTerms - invalid member of NewsArticle',
                'level': 'Error'
            },
            {
                'line': 'itemtype="http://schema.org/UserComment"',
                'num': 111,
                'string': 'UserComment - invalid class',
                'level': 'Error'
            },
            {
                'line': 'itemtype="http://schema.org/UserComment"',
                'num': 111,
                'string': 'UserComment - invalid class',
                'level': 'Error'
            },
            {
                'line': 'itemtype="http://schema.org/UserComment"',
                'num': 111,
                'string': 'UserComment - invalid class',
                'level': 'Error'
            }
        ],
        'namespace': 'http://schema.org/',
        'warnings': []
    }

    # Build the lookup of acceptable messages once, rather than rebuilding a
    # list for every reported error inside the loop.
    expected_strings = set(entry['string'] for entry in expected['errors'])

    for res in results:
        if res.classname == 'SchemaOrgSchemaDef':
            self.assertTrue(len(res.errors) > 0)
            for err in res.errors:
                # assertIn shows the unexpected message when the check fails.
                self.assertIn(err.string, expected_strings)
from distillery import ParselyDistiller, NewsDistiller
from schemato import Schemato
from pprint import pprint

# Demo script: loads two article URLs into Schemato and pretty-prints what
# each distiller strategy extracts from them.
#
# print is called with a single parenthesised argument throughout, which
# produces the same output under Python 2 (print statement) and Python 3
# (print function).

print("Loading NY Daily News...")
nydailynews = Schemato(
    "http://www.nydailynews.com/news/politics/obama-fights-back-2nd-debate-romney-article-1.1185271"
)
print("Done.")

print("Loading Mashable...")
mashable = Schemato("http://mashable.com/2012/10/17/iphone-5-supply-problems/")
print("Done.")


def demo(desc, class_, site):
    """Run one distiller over a loaded document and print the outcome.

    Prints ``desc`` underlined with ``=`` characters, instantiates ``class_``
    with ``site``, calls its ``distill()`` method, and pretty-prints a dict
    holding the instance's ``distilled`` and ``sources`` attributes.

    :param desc: heading text printed above the output.
    :param class_: callable invoked as ``class_(site)``; the returned object
        must provide ``distill()``, ``distilled`` and ``sources``.
    :param site: object handed to ``class_`` (the script passes ``Schemato``
        instances).
    """
    print(desc)
    print("=" * len(desc))
    d = class_(site)
    d.distill()
    pprint({"distilled": d.distilled, "sources": d.sources})


demo("Parse.ly strategy on Mashable", ParselyDistiller, mashable)
demo("News strategy on Mashable", NewsDistiller, mashable)
demo("Parse.ly strategy on NY Daily News", ParselyDistiller, nydailynews)
demo("News strategy on NY Daily News", NewsDistiller, nydailynews)
def test_schema_no_errors(self):
    """Validate schema.html and require every returned result to be empty.

    Each item produced by ``Schemato.validate()`` for
    ``test_documents/schema.html`` must have a length of zero.
    """
    # NOTE(review): this relies on each result supporting len(); what that
    # length counts is not visible from here -- confirm against the result type.
    validation = Schemato("test_documents/schema.html").validate()
    for entry in validation:
        size = len(entry)
        self.assertTrue(size == 0)
def assert_no_validation_errors(self, doc):
    """Assert that validating ``doc`` yields no warnings and no errors.

    Builds a ``Schemato`` from ``doc``, runs ``validate()``, and asserts that
    every result has an empty ``warnings`` collection and an empty ``errors``
    collection.

    :param doc: value passed straight to the ``Schemato`` constructor.
    :raises AssertionError: if any result carries a warning or an error.
    """
    validator = Schemato(doc)
    outcomes = validator.validate()

    # Evaluate every result before asserting, warnings first and then errors.
    clean_flags = []
    for outcome in outcomes:
        is_clean = len(outcome.warnings) == 0 and len(outcome.errors) == 0
        clean_flags.append(is_clean)

    assert all(clean_flags)