def _open_sample_and_page(name):
    """Load the sample spec *name* and build its page pair.

    Returns a 2-tuple ``(annotated_page, original_page)`` where the first
    page's body has the sample's cleaned annotations applied and the second
    is the untouched original body; both share the sample's url.
    """
    spec = _open_spec(name)
    extracts = spec['plugins']['annotations-plugin']['extracts']
    original_body = spec['original_body']
    annotated_body = apply_annotations(_clean_annotation_data(extracts),
                                       original_body)
    page_url = spec['url']
    annotated_page = HtmlPage(url=page_url, body=annotated_body)
    original_page = HtmlPage(url=page_url, body=original_body)
    return (annotated_page, original_page)
class ExtractorTest(TestCase):
    """Tests for slybot field extractors: regex extractors, type extractors
    and per-annotation extractor chains applied through SlybotIBLExtractor.
    """

    # Template/page pair 1: one annotated row carrying a 'gender' value.
    # Quotes inside the data-scrapy-annotate attribute value are HTML-escaped
    # as &quot; — a raw '"' would terminate the attribute early and truncate
    # the embedded annotation JSON.
    annotated = u"""
<table>
<tr data-scrapy-annotate="{&quot;required&quot;: [], &quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;content&quot;: &quot;gender&quot;}}">
<th class="item-key">Gender</th>
<td >Male</td></tr>
</table>"""
    _target = u"""
<table>
<tr>
<th class="item-key">Gender</th>
<td >Male</td></tr>
</table>"""
    # Template/page pair 2: 'name' and 'gender' annotations; the target page
    # has an empty <span> so the gender regex extracts nothing.
    annotated2 = u"""
<table>
<tr data-scrapy-annotate="{&quot;required&quot;: [], &quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">
<th class="item-key">Name</th>
<td >John</td></tr>
<span data-scrapy-annotate="{&quot;required&quot;: [], &quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;content&quot;: &quot;gender&quot;}}">Male</span>
</table>"""
    _target2 = u"""
<body>
<tr>
<th class="item-key">Name</th><td>Olivia</td></tr>
<span></span>
</body>"""

    # Annotation set for the per-annotation extractor test: two child
    # annotations contained in a single 'body' container. Extractor ids
    # ('1'..'8') refer to the extractors dict built inside
    # test_per_annotation_extractors.
    annotations = _clean_annotation_data([{
        'id': 'annotation',
        'selector': 'td > a',
        'container_id': 'parent',
        'data': {
            1: {'attribute': 'content', 'field': 'title',
                'required': False, 'extractors': []},
            2: {'attribute': 'content', 'field': 'name',
                'required': False, 'extractors': ['3']},
            3: {'attribute': 'href', 'field': 'url',
                'required': False, 'extractors': ['1', '2']}
        }
    }, {
        'id': 'annotation',
        'selector': 'span',
        'container_id': 'parent',
        'data': {
            1: {'attribute': 'content', 'field': 'price',
                'required': False, 'extractors': ['8', '4', '5', '6']},
            2: {'attribute': 'content', 'field': 'date',
                'required': False, 'extractors': ['4', '7']}
        }
    }, {
        'id': 'parent', 'item_container': True, 'selector': 'body'
    }])
    target3 = u"""
<html>
<body>
<tr>
<th class="item-key">Name</th>
<td>
<a href="/olivia.html">Name: Olivia</a>
</td>
</tr><span>2016-03-17 20:25</span>
</body></html>"""

    template = HtmlPage(url="http://www.test.com/", body=annotated)
    target = HtmlPage(url="http://www.test.com/", body=_target)
    template2 = HtmlPage(url="http://www.test.com/", body=annotated2)
    target2 = HtmlPage(url="http://www.test.com/a", body=_target2)
    template3 = HtmlPage(url="http://www.test.com/a",
                         body=apply_annotations(annotations, target3))
    # Rebind target3 from the raw HTML string to the page built from it.
    target3 = HtmlPage(url="http://www.test.com/a", body=target3)

    def test_regex_extractor(self):
        """A regex extractor joins all capture groups into one value."""
        extractor = create_regex_extractor(r"(\d+).*(\.\d+)")
        extracted = extractor(
            u"The price of this product is <div>45</div> "
            u"</div class='small'>.50</div> pounds")
        self.assertEqual(extracted, u"45.50")
        processor = TextFieldTypeProcessor()
        self.assertEqual(processor.adapt(extracted, None), u"45.50")

    def test_raw_type_w_regex(self):
        """'raw' fields keep the matched markup verbatim."""
        schema = {
            'fields': {
                'gender': {
                    'required': False,
                    'type': 'raw',
                    'vary': False,
                }
            }
        }
        descriptor = create_slybot_item_descriptor(schema)
        extractors = {
            1: {"regular_expression": r"Gender.*(<td\s*>(?:Male|Female)</td>)"}
        }
        apply_extractors(descriptor, {"gender": [1]}, extractors)
        ibl_extractor = SlybotIBLExtractor([
            (self.template, {'#default': descriptor}, '0.12.0')])
        self.assertEqual(ibl_extractor.extract(self.target)[0][0]['gender'],
                         [u'<td >Male</td>'])

    def test_negative_hit_w_regex(self):
        """A regex hit that fails the field's type check yields no item."""
        schema = {
            'fields': {
                'gender': {
                    'required': False,
                    'type': 'number',
                    'vary': False,
                }
            }
        }
        descriptor = create_slybot_item_descriptor(schema)
        extractors = {1: {"regular_expression": r"Gender\s+(Male|Female)"}}
        apply_extractors(descriptor, {"gender": [1]}, extractors)
        ibl_extractor = SlybotIBLExtractor([
            (self.template, {'#default': descriptor}, '0.12.0')])
        self.assertEqual(ibl_extractor.extract(self.target)[0], None)

    def test_text_type_w_regex(self):
        """'text' fields hold the captured group of the regex."""
        schema = {
            "fields": {
                'gender': {
                    'required': False,
                    'type': 'text',
                    'vary': False,
                }
            }
        }
        descriptor = create_slybot_item_descriptor(schema)
        extractors = {1: {"regular_expression": r"Gender\s+(Male|Female)"}}
        apply_extractors(descriptor, {"gender": [1]}, extractors)
        ibl_extractor = SlybotIBLExtractor([
            (self.template, {'#default': descriptor}, '0.12.0')])
        self.assertEqual(ibl_extractor.extract(self.target)[0][0]['gender'],
                         [u'Male'])

    def test_type_extractor(self):
        """Multiple extractors chain: type extractor then regex."""
        schema = {
            "fields": {
                'gender': {
                    'required': False,
                    'type': 'number',
                    'vary': False,
                }
            }
        }
        descriptor = create_slybot_item_descriptor(schema)
        extractors = {
            1: {"type_extractor": "text"},
            2: {"regular_expression": r"Gender\s+(Male|Female)"}
        }
        apply_extractors(descriptor, {"gender": [1, 2]}, extractors)
        ibl_extractor = SlybotIBLExtractor([
            (self.template, {'#default': descriptor}, '0.12.0')])
        self.assertEqual(ibl_extractor.extract(self.target)[0][0]['gender'],
                         [u'Male'])

    def test_default_type_extractor(self):
        """Fields absent from the schema fall back to the default type."""
        schema = {
            'fields': {}
        }
        descriptor = create_slybot_item_descriptor(schema)
        extractors = {
            1: {"regular_expression": r"Gender\s+(Male|Female)"}
        }
        apply_extractors(descriptor, {"gender": [1]}, extractors)
        ibl_extractor = SlybotIBLExtractor([
            (self.template, {'#default': descriptor}, '0.12.0')])
        self.assertEqual(ibl_extractor.extract(self.target)[0][0]['gender'],
                         [u'Male'])

    def test_text_type_w_regex_and_no_groups(self):
        """A regex without capture groups returns the whole match."""
        schema = {
            'fields': {
                'gender': {
                    'required': False,
                    'type': 'text',
                    'vary': False,
                }
            }
        }
        descriptor = create_slybot_item_descriptor(schema)
        extractors = {
            1: {"regular_expression": "Gender"}
        }
        apply_extractors(descriptor, {"gender": [1]}, extractors)
        ibl_extractor = SlybotIBLExtractor([
            (self.template, {'#default': descriptor}, '0.12.0')])
        self.assertEqual(ibl_extractor.extract(self.target)[0][0]['gender'],
                         [u'Gender'])

    def test_extractor_w_empty_string_extraction(self):
        """A regex extracting nothing must not block other required fields."""
        schema = {
            'fields': {
                'gender': {
                    'required': False,
                    'type': 'text',
                    'vary': False,
                },
                'name': {
                    'required': True,
                    'type': 'text',
                    'vary': False,
                }
            }
        }
        descriptor = create_slybot_item_descriptor(schema)
        extractors = {
            1: {"regular_expression": "([0-9]+)"}
        }
        apply_extractors(descriptor, {"gender": [1]}, extractors)
        ibl_extractor = SlybotIBLExtractor([
            (self.template2, {'#default': descriptor}, '0.12.0')])
        self.assertEqual(ibl_extractor.extract(self.target2)[0][0]['name'],
                         [u'Name Olivia'])

    def test_per_annotation_extractors(self):
        """Extractors attached to individual annotations are applied in the
        order listed in each annotation's 'extractors' list."""
        schema = {
            'fields': {
                'url': {
                    'required': False,
                    'type': 'text',
                    'vary': False,
                },
                'name': {
                    'required': True,
                    'type': 'text',
                    'vary': False,
                }
            }
        }
        extractors = {
            '1': {'type_extractor': 'url'},
            '2': {'regular_expression': r'(.*)\.html'},
            '3': {'regular_expression': 'Name: (.*)'},
            '4': {'type_extractor': 'text'},
            '5': {'type_extractor': 'price'},
            '6': {'type_extractor': 'number'},
            '7': {'type_extractor': 'date'},
            '8': {'regular_expression': r'(\d+)-'}
        }
        descriptors = {'#default': create_slybot_item_descriptor(schema)}
        add_extractors_to_descriptors(descriptors, extractors)
        ibl_extractor = SlybotIBLExtractor([
            (self.template3, descriptors, '0.13.0')
        ])
        result = {'name': [u'Olivia'],
                  'url': [u'http://www.test.com/olivia'],
                  'title': [u'Name: Olivia'],
                  'price': [u'2016'],
                  'date': [datetime(2016, 3, 17, 20, 25)]}
        data = ibl_extractor.extract(self.target3)[0][0]
        # '_template' records which template matched; not under test here.
        del data['_template']
        self.assertEqual(data, result)
<div><p>Text {idx} Text {idx}</p><p>Text {idx} Text {idx}</p></div>
</li>
""".format
# NOTE(review): the text above is the tail of a triple-quoted `item_template`
# string whose opening (and the `item_template = ` assignment) lies in an
# earlier, unseen chunk of this file — verify against the full source.

# Ten <li> items; even indexes get an empty rank (i % 2 is falsy), odd
# indexes get their own number as rank — TODO confirm intent.
html = base_page('\n'.join(item_template(idx=i, rank=i if i % 2 else '')
                           for i in range(1, 11)))
# Three child annotations inside a repeated 'li' container, which itself
# sits inside a 'ul' container.
annotations = _clean_annotation_data([
    {'id': 'annotation1',
     'selector': 'li > div > h3 > a',
     'container_id': 'repeated_parent',
     'data': {1: {'attribute': 'content', 'field': 'title',
                  'required': False, 'extractors': []},
              2: {'attribute': 'href', 'field': 'url',
                  'required': False, 'extractors': ['1', '2']}}},
    {'id': 'annotation2',
     'selector': 'li > div > span',
     'container_id': 'repeated_parent',
     'data': {1: {'attribute': 'content', 'field': 'rank',
                  'required': True, 'extractors': []}}},
    {'id': 'annotation3',
     'selector': 'li > div:nth-child(2)',
     'container_id': 'repeated_parent',
     'data': {1: {'attribute': 'content', 'field': 'description',
                  'required': True, 'extractors': []}}},
    {'id': 'parent', 'item_container': True, 'selector': 'ul'},
    {'id': 'repeated_parent', 'item_container': True, 'container_id': 'parent',
     'selector': 'li', 'repeated': True}])
schemas = {
    '#default': {'name': 'default_item', 'fields': {}},
    'data': {
        'name': 'data_item',
        'fields': {
            'title': {'required': False, 'vary': False, 'type': 'text'},
            'url': {'required': False, 'vary': False, 'type': 'url'},
            # NOTE(review): chunk ends here mid-dictionary; the remaining
            # fields and closing braces are in a later, unseen chunk.
# NOTE(review): this chunk opens mid-statement — `items = json.load(f)` is the
# body of a `with open(...)` block whose header precedes this chunk; the
# reconstructed indentation below must be re-checked against the full file.
    items = json.load(f)
# Descriptor for the '#default' item built from the loaded schema.
descriptors = {'#default': create_slybot_item_descriptor(items['default'],
                                                         'default')}
template = parse_template(td, html_page, descriptors)
# Same template parsed without descriptors, for tests of unvalidated data.
unvalidated_template = parse_template(td, html_page, {})
unvalidated_template.id = u'stack_overflow_test'
basic_extractors = BasicTypeExtractor.create(template.annotations)
# Positional split of the created extractors: one uncontained annotation,
# a root container, a child container, then the remaining child annotations.
uncontained_annotation = basic_extractors[0]
root_container = basic_extractors[1]
child_container = basic_extractors[2]
child_annotations = basic_extractors[3:]

# Fixture: 411 listing sample — annotated/original page pair.
with open('%s/data/templates/411_list.json' % _PATH) as f:
    sample = json.load(f)
annotations = sample['plugins']['annotations-plugin']['extracts']
annotated = apply_annotations(_clean_annotation_data(annotations),
                              sample['original_body'])
sample_411 = HtmlPage(url=sample['url'], body=annotated)
page_411 = HtmlPage(url=sample['url'], body=sample['original_body'])

# Fixture: daft listing sample — annotated/original page pair.
with open('%s/data/templates/daft_list.json' % _PATH) as f:
    sample = json.load(f)
annotations = sample['plugins']['annotations-plugin']['extracts']
annotated = apply_annotations(_clean_annotation_data(annotations),
                              sample['original_body'])
sample_daft = HtmlPage(url=sample['url'], body=annotated)
page_daft = HtmlPage(url=sample['url'], body=sample['original_body'])
# Relax the daft annotations: mark every annotated attribute as optional.
for annotation in annotations:
    for attribute in annotation.get('data', {}).values():
        attribute['required'] = False
# NOTE(review): this chunk opens mid-method — the statements below are the
# tail of a csv-feed link-extractor test whose `def` header and the
# `specs`/`csvfeed3` fixtures precede this chunk; the reconstructed
# indentation must be re-checked against the full file.
        lextractor = create_linkextractor_from_specs(specs)
        response = TextResponse(url='http://www.example.com/', body=csvfeed3)
        links = list(lextractor.links_to_follow(response))
        # Expect exactly the two urls from the feed's link column.
        self.assertEqual(len(links), 2)
        self.assertEqual(links[0].url, 'http://www.example.com/path')
        self.assertEqual(links[1].url, 'http://www.example.com/path2')

# Module-level fixtures for the HTML link-extractor tests below.
html = """
<a href="http://www.example.com/path">Click here</a>
"""
_PATH = dirname(__file__)
# Build the daft sample's annotated body once at import time.
with open('%s/data/templates/daft_list.json' % _PATH) as f:
    daft_sample = json.load(f)
annotations = daft_sample['plugins']['annotations-plugin']['extracts']
daft_body = apply_annotations(_clean_annotation_data(annotations),
                              daft_sample['original_body'])
daft_sample['annotated_body'] = daft_body


class Test_HtmlLinkExtractor(TestCase):
    def test_simple(self):
        """The 'html' extractor yields each anchor's url and text."""
        specs = {"type": "html", "value": None}
        lextractor = create_linkextractor_from_specs(specs)
        response = HtmlResponse(url='http://www.example.com/', body=html)
        links = list(lextractor.links_to_follow(response))
        self.assertEqual(len(links), 1)
        self.assertEqual(links[0].url, 'http://www.example.com/path')
        self.assertEqual(links[0].text, 'Click here')
# NOTE(review): this chunk appears to be an overlapping duplicate of the
# previous one and also opens mid-method — the statements below are the body
# of a 'column' csv-feed link-extractor test whose `def` header and the
# `csvfeed3` fixture precede this chunk; indentation is reconstructed and
# must be re-checked against the full file.
        specs = {"type": "column", "value": 1}
        lextractor = create_linkextractor_from_specs(specs)
        response = TextResponse(url='http://www.example.com/', body=csvfeed3)
        links = list(lextractor.links_to_follow(response))
        # Expect exactly the two urls from column 1 of the feed.
        self.assertEqual(len(links), 2)
        self.assertEqual(links[0].url, 'http://www.example.com/path')
        self.assertEqual(links[1].url, 'http://www.example.com/path2')

# Module-level fixtures for the HTML link-extractor tests below.
html = """
<a href="http://www.example.com/path">Click here</a>
"""
_PATH = dirname(__file__)
# Build the daft sample's annotated body once at import time.
with open('%s/data/templates/daft_list.json' % _PATH) as f:
    daft_sample = json.load(f)
annotations = daft_sample['plugins']['annotations-plugin']['extracts']
daft_body = apply_annotations(_clean_annotation_data(annotations),
                              daft_sample['original_body'])
daft_sample['annotated_body'] = daft_body


class Test_HtmlLinkExtractor(TestCase):
    def test_simple(self):
        """The 'html' extractor yields each anchor's url and text."""
        specs = {"type": "html", "value": None}
        lextractor = create_linkextractor_from_specs(specs)
        response = HtmlResponse(url='http://www.example.com/', body=html)
        links = list(lextractor.links_to_follow(response))
        self.assertEqual(len(links), 1)
        self.assertEqual(links[0].url, 'http://www.example.com/path')
        self.assertEqual(links[0].text, 'Click here')