示例#1
0
 def test_site_pages(self):
     """
     Tests from real pages. More reliable and easy to build for more complicated structures

     Walks the numbered fixture pairs ``samples_pageparsing_<n>.html`` /
     ``samples_pageparsing_<n>.json`` (n = 0, 1, ...) and stops at the first
     missing JSON file. Each HTML page is fed to a ``TemplatePageParser`` and
     every annotation the parser produced is compared, slot by slot, with the
     expected values stored in the matching JSON file.
     """
     SAMPLES_FILE_PREFIX = os.path.join(path, "samples/samples_pageparsing")
     count = 0
     fname = "%s_%d.json" % (SAMPLES_FILE_PREFIX, count)
     while os.path.exists(fname):
         # Context managers close the fixture files right away instead of
         # leaving one open handle per iteration to the garbage collector.
         with open("%s_%d.html" % (SAMPLES_FILE_PREFIX, count), "rb") as html_file:
             source = html_file.read().decode('utf-8')
         with open(fname, "rb") as json_file:
             annotations = json.loads(json_file.read().decode('utf-8'))
         template = HtmlPage(body=source)
         parser = TemplatePageParser(TokenDict())
         parser.feed(template)
         for annotation in parser.annotations:
             # Expected annotations are consumed in order, front first.
             test_annotation = annotations.pop(0)
             for s in annotation.__slots__:
                 if s == "tag_attributes":
                     # Compared pair by pair; each pair is turned into a
                     # list because JSON has no tuple type.
                     for pair in getattr(annotation, s):
                         self.assertEqual(list(pair), test_annotation[s].pop(0))
                 else:
                     self.assertEqual(getattr(annotation, s), test_annotation[s])
         # Every expected annotation must have been matched by the parser.
         self.assertEqual(annotations, [])
         count += 1
         fname = "%s_%d.json" % (SAMPLES_FILE_PREFIX, count)
示例#2
0
 def annotations(self):
     """Return all annotations contained in the template as a list of tuples
     (annotation, index)

     ``index`` is the position of the annotated tag inside
     ``self.htmlpage.parsed_body``. Only opening tags carrying a non-empty
     ``data-scrapy-annotate`` attribute are reported. The attribute value
     has its ``&quot;`` entities turned back into double quotes and is then
     decoded as JSON, so ``json.loads`` raises ``ValueError`` when the
     resulting text is not valid JSON.
     """
     anlist = []
     for i, f in enumerate(self.htmlpage.parsed_body):
         if isinstance(f, HtmlTag) and f.tag_type == HtmlTagType.OPEN_TAG:
             at = f.attributes.get("data-scrapy-annotate")
             if at:
                 # Double quotes inside the attribute are stored as the
                 # &quot; entity; undo that before JSON decoding.
                 an = json.loads(at.replace("&quot;", '"'))
                 anlist.append((an, i))
     return anlist
示例#3
0
 def test_site_samples(self):
     """test parse_html from real cases

     Walks the numbered fixture pairs ``<SAMPLES_FILE_PREFIX>_<n>.html`` /
     ``<SAMPLES_FILE_PREFIX>_<n>.json`` (n = 0, 1, ...) and stops at the
     first missing JSON file. Each JSON file is decoded with
     ``_decode_element`` as the object hook, then the page source, the
     decoded expectation and the sample number are passed to
     ``self._test_sample``.
     """
     count = 0
     fname = "%s_%d.json" % (SAMPLES_FILE_PREFIX, count)
     while os.path.exists(fname):
         # Context managers close the fixture files right away instead of
         # leaving one open handle per iteration to the garbage collector.
         with open("%s_%d.html" % (SAMPLES_FILE_PREFIX, count), "rb") as html_file:
             source = html_file.read().decode('utf-8')
         with open(fname, "rb") as json_file:
             parsed = json.loads(json_file.read().decode('utf-8'),
                                 object_hook=_decode_element)
         self._test_sample(source, parsed, count)
         count += 1
         fname = "%s_%d.json" % (SAMPLES_FILE_PREFIX, count)
示例#4
0
 def annotations(self):
     """Return all annotations contained in the template as a list of tuples
     (annotation, index)

     ``index`` is the position of the annotated tag inside
     ``self.htmlpage.parsed_body``. Only opening tags carrying a non-empty
     ``data-scrapy-annotate`` attribute are reported. The attribute value
     has its ``&quot;`` entities turned back into double quotes and is then
     decoded as JSON, so ``json.loads`` raises ``ValueError`` when the
     resulting text is not valid JSON.
     """
     anlist = []
     for i, f in enumerate(self.htmlpage.parsed_body):
         if isinstance(f, HtmlTag) and f.tag_type == HtmlTagType.OPEN_TAG:
             at = f.attributes.get('data-scrapy-annotate')
             if at:
                 # Replacing '"' with '"' did nothing; the entity to undo
                 # before JSON decoding is &quot;.
                 an = json.loads(at.replace('&quot;', '"'))
                 anlist.append((an, i))
     return anlist
 def _read_template_annotation(html_tag):
     """Return the decoded ``data-scrapy-annotate`` annotation of *html_tag*.

     Returns ``None`` when the tag has no such attribute. Otherwise the
     attribute value has its ``&quot;`` entities turned back into double
     quotes and is decoded as JSON; ``json.loads`` raises ``ValueError``
     when the resulting text is not valid JSON.
     """
     template_attr = html_tag.attributes.get("data-scrapy-annotate")
     if template_attr is None:
         return None
     # Double quotes inside the attribute are stored as the &quot; entity;
     # undo that before JSON decoding.
     unescaped = template_attr.replace("&quot;", '"')
     return json.loads(unescaped)
示例#6
0
 def _read_template_annotation(html_tag):
     """Return the decoded ``data-scrapy-annotate`` annotation of *html_tag*.

     Returns ``None`` when the tag has no such attribute. Otherwise the
     attribute value has its ``&quot;`` entities turned back into double
     quotes and is decoded as JSON; ``json.loads`` raises ``ValueError``
     when the resulting text is not valid JSON.
     """
     template_attr = html_tag.attributes.get('data-scrapy-annotate')
     if template_attr is None:
         return None
     # Replacing '"' with '"' did nothing; the entity to undo before JSON
     # decoding is &quot;.
     unescaped = template_attr.replace('&quot;', '"')
     return json.loads(unescaped)