Example #1
def _build_sample(sample):
    from slybot.plugins.scrapely_annotations.builder import Annotations
    data = sample.get('plugins', {}).get('annotations-plugin')
    if data:
        Annotations().save_extraction_data(data, sample)
    sample['page_id'] = sample.get('page_id') or sample.get('id') or ""
    return sample
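A minimal usage sketch; the sample dict is illustrative, not from the source:

# With no annotation data present, only the page_id fallback runs;
# 'id' stands in for a missing 'page_id'.
sample = {'id': 'sample-1', 'plugins': {}}
sample = _build_sample(sample)
assert sample['page_id'] == 'sample-1'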
Example #2
def open_sample_and_page(name):
    sample_spec = open_spec(name)
    url = sample_spec['url']
    return (HtmlPage(url=url, body=Annotations(sample_spec).apply()),
            HtmlPage(url=url, body=sample_spec['original_body']))
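open_spec is not shown in this excerpt; one plausible sketch, assuming specs live in a local data directory (the path and the dispatch on file extension are guesses, informed by Examples #6 and #7 below):

import json
import os

def open_spec(name):
    # Hypothetical helper: load a spec file from a local data directory,
    # parsing JSON specs and returning raw text for anything else.
    path = os.path.join(os.path.dirname(__file__), 'data', name)
    with open(path) as f:
        return json.load(f) if name.endswith('.json') else f.read()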
Example #3
def _build_sample(sample, legacy=False):
    from slybot.plugins.scrapely_annotations.builder import Annotations
    Annotations(sample, legacy=legacy).build()
    sample['annotated'] = True
    return sample
Example #4
import os

import six

from slybot.plugins.scrapely_annotations.builder import Annotations

REQUIRED_FILES = {
    'setup.py', 'scrapy.cfg', 'extractors.json', 'items.json', 'project.json',
    'spiders/__init__.py', 'spiders/settings.py'
}
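# `templates` maps template names to file-content strings; it is defined
# elsewhere in the source module and not shown in this excerpt.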
FILE_TEMPLATES = {
    'extractors.json': '{}',
    'items.json': '{}',
    'project.json': templates['PROJECT'],
    'scrapy.cfg': templates['SCRAPY'],
    'setup.py': templates['SETUP'],
    'spiders/__init__.py': '',
    'spiders/settings.py': templates['SETTINGS']
}
apply_annotations = Annotations().save_extraction_data


class ProjectArchiver(object):

    required_files = frozenset(REQUIRED_FILES)
    file_templates = FILE_TEMPLATES

    def __init__(self, project, version=None, required_files=None):
        if version is None:
            version = (0, 10)
        self.separator = os.path.sep
        self.version = version
        self.project = project
        if required_files is not None:
            self.required_files = required_files
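A hedged instantiation sketch; the real type of `project` comes from the surrounding codebase, so a plain dict stands in here:

archiver = ProjectArchiver(project={'name': 'example'}, version=(0, 10))
assert 'setup.py' in archiver.required_files
assert archiver.separator == os.path.sep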
Example #5
class ExtractorTest(TestCase):

    annotated = u"""
<table>
<tr data-scrapy-annotate="{&quot;required&quot;: [], &quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;content&quot;: &quot;gender&quot;}}">
<th class="item-key">Gender</th>
<td >Male</td></tr>
</table>"""
    _target = u"""
<table>
<tr>
<th class="item-key">Gender</th>
<td >Male</td></tr>
</table>"""
    annotated2 = u"""
<table>
<tr data-scrapy-annotate="{&quot;required&quot;: [], &quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;content&quot;: &quot;name&quot;}}">
<th class="item-key">Name</th>
<td >John</td></tr>
<span data-scrapy-annotate="{&quot;required&quot;: [], &quot;variant&quot;: 0, &quot;annotations&quot;: {&quot;content&quot;: &quot;gender&quot;}}">Male</span>
</table>"""
    _target2 = u"""
<body>
<tr>
<th class="item-key">Name</th><td>Olivia</td></tr>
<span></span>
</body>"""

    annotations = _clean_annotation_data([{
        'id': 'annotation1',
        'selector': 'td > a',
        'container_id': 'parent',
        'data': {
            1: {
                'attribute': 'content',
                'field': 'title',
                'required': False,
                'extractors': []
            },
            2: {
                'attribute': 'content',
                'field': 'name',
                'required': False,
                'extractors': ['3']
            },
            3: {
                'attribute': 'href',
                'field': 'url',
                'required': False,
                'extractors': ['1', '2']
            }
        }
    }, {
        'id': 'annotation2',
        'selector': 'span',
        'container_id': 'parent',
        'data': {
            1: {
                'attribute': 'content',
                'field': 'price',
                'required': False,
                'extractors': ['8', '4', '5', '6']
            },
            2: {
                'attribute': 'content',
                'field': 'date',
                'required': False,
                'extractors': ['4', '7']
            }
        }
    }, {
        'id': 'parent',
        'item_container': True,
        'selector': 'body'
    }])
    target3 = u"""
    <html>
    <body>
    <tr>
        <th class="item-key">Name</th>
        <td>
            <a href="/olivia.html">Name: Olivia</a>
        </td>
    </tr><span>2016-03-17 20:25</span>
    </body></html>"""

    template = HtmlPage(url="http://www.test.com/", body=annotated)
    target = HtmlPage(url="http://www.test.com/", body=_target)
    template2 = HtmlPage(url="http://www.test.com/", body=annotated2)
    target2 = HtmlPage(url="http://www.test.com/a", body=_target2)
    sample3 = {
        'plugins': {
            'annotations-plugin': {
                'extracts': annotations
            }
        },
        'original_body': target3
    }
    template3 = HtmlPage(url="http://www.test.com/a",
                         body=Annotations(sample3).apply())
    target3 = HtmlPage(url="http://www.test.com/a", body=target3)

    def test_regex_extractor(self):
        extractor = create_regex_extractor(r"(\d+).*(\.\d+)")
        extracted = extractor(
            u"The price of this product is <div>45</div> </div class='small'>.50</div> pounds"
        )
        self.assertEqual(extracted, u"45.50")
        processor = TextFieldTypeProcessor()
        self.assertEqual(processor.adapt(extracted, None), u"45.50")

    def test_raw_type_w_regex(self):
        schema = {
            'fields': {
                'gender': {
                    'required': False,
                    'type': 'raw',
                    'vary': False,
                }
            }
        }
        descriptor = create_slybot_item_descriptor(schema)
        extractors = {
            1: {
                "regular_expression": "Gender.*(<td\s*>(?:Male|Female)</td>)"
            }
        }
        apply_extractors(descriptor, {"gender": [1]}, extractors)

        ibl_extractor = SlybotIBLExtractor([(self.template, {
            '#default': descriptor
        }, '0.12.0')])
        self.assertEqual(
            ibl_extractor.extract(self.target)[0][0]['gender'],
            [u'<td >Male</td>'])

    def test_negative_hit_w_regex(self):
        schema = {
            'fields': {
                'gender': {
                    'required': False,
                    'type': 'number',
                    'vary': False,
                }
            }
        }
        descriptor = create_slybot_item_descriptor(schema)
        extractors = {1: {"regular_expression": "Gender\\s+(Male|Female)"}}
        apply_extractors(descriptor, {"gender": [1]}, extractors)

        ibl_extractor = SlybotIBLExtractor([(self.template, {
            '#default': descriptor
        }, '0.12.0')])
        self.assertEqual(ibl_extractor.extract(self.target)[0], None)

    def test_text_type_w_regex(self):
        schema = {
            "fields": {
                'gender': {
                    'required': False,
                    'type': 'text',
                    'vary': False,
                }
            }
        }
        descriptor = create_slybot_item_descriptor(schema)
        extractors = {1: {"regular_expression": "Gender\\s+(Male|Female)"}}
        apply_extractors(descriptor, {"gender": [1]}, extractors)

        ibl_extractor = SlybotIBLExtractor([(self.template, {
            '#default': descriptor
        }, '0.12.0')])
        self.assertEqual(
            ibl_extractor.extract(self.target)[0][0]['gender'], [u'Male'])

    def test_type_extractor(self):
        schema = {
            "fields": {
                'gender': {
                    'required': False,
                    'type': 'number',
                    'vary': False,
                }
            }
        }
        descriptor = create_slybot_item_descriptor(schema)
        extractors = {
            1: {
                "type_extractor": "text"
            },
            2: {
                "regular_expression": "Gender\\s+(Male|Female)"
            }
        }
        apply_extractors(descriptor, {"gender": [1, 2]}, extractors)

        ibl_extractor = SlybotIBLExtractor([(self.template, {
            '#default': descriptor
        }, '0.12.0')])
        self.assertEqual(
            ibl_extractor.extract(self.target)[0][0]['gender'], [u'Male'])

    def test_default_type_extractor(self):
        schema = {'fields': {}}
        descriptor = create_slybot_item_descriptor(schema)
        extractors = {1: {"regular_expression": "Gender\\s+(Male|Female)"}}
        apply_extractors(descriptor, {"gender": [1]}, extractors)

        ibl_extractor = SlybotIBLExtractor([(self.template, {
            '#default': descriptor
        }, '0.12.0')])
        self.assertEqual(
            ibl_extractor.extract(self.target)[0][0]['gender'], [u'Male'])

    def test_text_type_w_regex_and_no_groups(self):
        schema = {
            'fields': {
                'gender': {
                    'required': False,
                    'type': 'text',
                    'vary': False,
                }
            }
        }
        descriptor = create_slybot_item_descriptor(schema)
        extractors = {1: {"regular_expression": "Gender"}}
        apply_extractors(descriptor, {"gender": [1]}, extractors)

        ibl_extractor = SlybotIBLExtractor([(self.template, {
            '#default': descriptor
        }, '0.12.0')])
        self.assertEqual(
            ibl_extractor.extract(self.target)[0][0]['gender'], [u'Gender'])

    def test_extractor_w_empty_string_extraction(self):
        schema = {
            'fields': {
                'gender': {
                    'required': False,
                    'type': 'text',
                    'vary': False,
                },
                'name': {
                    'required': True,
                    'type': 'text',
                    'vary': False,
                }
            }
        }
        descriptor = create_slybot_item_descriptor(schema)
        extractors = {1: {"regular_expression": "([0-9]+)"}}
        apply_extractors(descriptor, {"gender": [1]}, extractors)

        ibl_extractor = SlybotIBLExtractor([(self.template2, {
            '#default': descriptor
        }, '0.12.0')])
        self.assertEqual(
            ibl_extractor.extract(self.target2)[0][0]['name'],
            [u'Name Olivia'])

    def test_per_annotation_extractors(self):
        schema = {
            'fields': {
                'url': {
                    'required': False,
                    'type': 'text',
                    'vary': False,
                },
                'name': {
                    'required': True,
                    'type': 'text',
                    'vary': False,
                }
            }
        }
        extractors = {
            '1': {
                'type_extractor': 'url'
            },
            '2': {
                'regular_expression': r'(.*)\.html'
            },
            '3': {
                'regular_expression': 'Name: (.*)'
            },
            '4': {
                'type_extractor': 'text'
            },
            '5': {
                'type_extractor': 'price'
            },
            '6': {
                'type_extractor': 'number'
            },
            '7': {
                'type_extractor': 'date'
            },
            '8': {
                'regular_expression': r'(\d+)-'
            }
        }
        descriptors = {'#default': create_slybot_item_descriptor(schema)}
        add_extractors_to_descriptors(descriptors, extractors)
        ibl_extractor = SlybotIBLExtractor([(self.template3, descriptors,
                                             '0.13.0')])
        result = {
            u'_template': '6223d000057491040e4f411cf1f0734ea802eeb6',
            'name': [u'Olivia'],
            'url': [u'http://www.test.com/olivia'],
            'title': [u'Name: Olivia'],
            'price': [u'2016'],
            'date': [datetime(2016, 3, 17, 20, 25)]
        }
        data = ibl_extractor.extract(self.target3)[0][0]
        self.assertEqual(data, result)
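For illustration, the extractor exercised in test_regex_extractor behaves like joining all capture groups of the first match; a re-only sketch (the name `extract` is hypothetical):

import re

def extract(text, pattern=r"(\d+).*(\.\d+)"):
    # Join all capture groups of the first match, as the test above expects.
    match = re.search(pattern, text)
    return u"".join(match.groups()) if match else None

assert extract(u"price <div>45</div>.50 pounds") == u"45.50"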
Example #6
                'type': 'price'
            }
        }
    }
}

sample = {
    'plugins': {
        'annotations-plugin': {
            'extracts': annotations
        }
    },
    'original_body': html
}
simple_template = HtmlPage(url="http://www.test.com/a",
                           body=Annotations(sample).apply())
target1 = base_page('\n'.join(
    item_template(idx=i, rank=1) for i in range(1, 11)))
target2 = base_page('\n'.join(
    item_template(idx=i, rank=i if i % 2 else '') for i in range(1, 11)))
target1 = HtmlPage(url="http://www.test.com/a", body=target1)
target2 = HtmlPage(url="http://www.test.com/a", body=target2)
simple_descriptors = {
    k: create_slybot_item_descriptor(v)
    for k, v in schemas.items()
}
add_extractors_to_descriptors(simple_descriptors, {})

td = TokenDict()
html_page = HtmlPage(body=open_spec('stack_overflow.html'))
extraction_page = parse_extraction_page(td, html_page)
Example #7
        lextractor = create_linkextractor_from_specs(specs)
        response = UTF8TextResponse(url='http://www.example.com/',
                                    body=csvfeed3)
        links = list(lextractor.links_to_follow(response))
        self.assertEqual(len(links), 2)
        self.assertEqual(links[0].url, 'http://www.example.com/path')
        self.assertEqual(links[1].url, 'http://www.example.com/path2')


html = """
<a href="http://www.example.com/path">Click here</a>
"""
_PATH = dirname(__file__)
with open('%s/data/templates/daft_list.json' % _PATH) as f:
    daft_sample = json.load(f)
    daft_body = Annotations(daft_sample).apply()
    daft_sample['annotated_body'] = daft_body


class Test_HtmlLinkExtractor(TestCase):
    def test_simple(self):
        specs = {"type": "html", "value": None}
        lextractor = create_linkextractor_from_specs(specs)
        response = UTF8HtmlResponse(url='http://www.example.com/', body=html)
        links = list(lextractor.links_to_follow(response))
        self.assertEqual(len(links), 1)
        self.assertEqual(links[0].url, 'http://www.example.com/path')
        self.assertEqual(links[0].text, 'Click here')


class Test_PaginationExtractor(TestCase):
Example #8
def generate_from_samples(
    page_items,
    path='./slybot-project',
    spider_name='aile',
    min_item_fields=2,
    max_item_fields=None,
):
    """Generate a full slybot project

    Parameters
    ----------
    page_items: List[(page, items)]
         page is an HtmlPage where tagids attributes have been added
         items is List[Item]
    path : string
        Directory where to store the project
    min_item_fields: int or None
        Discard items with less fields than this number
    max_item_fields: int or None
        Discard items with more fields than this number

    Returns
    -------
    None
    """
    if not os.path.exists(path):
        os.mkdir(path)

    # project.json
    with open(os.path.join(path, 'project.json'), 'w') as project_file:
        json.dump(generate_project(), project_file, indent=4, sort_keys=True)

    # items.json
    all_items = collections.defaultdict(dict)
    for _, items in page_items:
        for item in items:
            for field_name, field_dict in item.dict['fields'].items():
                all_items[item.name][field_name] = field_dict
    with open(os.path.join(path, 'items.json'), 'w') as items_file:
        json.dump(
            {
                item_name: {
                    'fields': fields
                }
                for item_name, fields in all_items.items()
            },
            items_file,
            indent=4,
            sort_keys=True)

    # extractors
    with open(os.path.join(path, 'extractors.json'), 'w') as extractors_file:
        json.dump({}, extractors_file, indent=4, sort_keys=True)

    # spiders/
    spiders_dir = os.path.join(path, 'spiders')
    if not os.path.exists(spiders_dir):
        os.mkdir(spiders_dir)
    spider_dir = os.path.join(spiders_dir, spider_name)
    if not os.path.exists(spider_dir):
        os.mkdir(spider_dir)
    templates = []
    for i, (page, items) in enumerate(page_items):
        template = generate_empty_template(page)
        annotations = []
        for item in filter(item_is_tag, items):
            if min_item_fields is not None and len(
                    item.fields) < min_item_fields:
                continue
            if max_item_fields is not None and len(
                    item.fields) > max_item_fields:
                continue
            annotations += merge_tagid_annotations(
                generate_item_annotations(item))

        annotations = merge_containers(annotations)
        template['plugins'] = {'annotations-plugin': {'extracts': annotations}}
        Annotations().save_extraction_data({'extracts': annotations}, template)
        template_name = 'template-{0}'.format(i)
        template['name'] = template['id'] = template_name
        template_path = os.path.join(spider_dir,
                                     '{0}.json'.format(template_name))
        with open(template_path, 'w') as template_file:
            json.dump(template, template_file, indent=4, sort_keys=True)
        html_path = os.path.join(spider_dir, template_name + '-annotated.html')
        with open(html_path, 'wb') as template_annotated:
            template_annotated.write(
                template['annotated_body'].encode('utf-8'))
        templates.append(template_name)

    spider_path = os.path.join(spiders_dir, '{0}.json'.format(spider_name))
    with open(spider_path, 'w') as spider_file:
        json.dump(generate_spider(page.url, templates),
                  spider_file,
                  indent=4,
                  sort_keys=True)
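A hedged post-run check, tying back to the REQUIRED_FILES set in Example #4 (the helper name is illustrative):

import os

def project_is_complete(path='./slybot-project'):
    # These are exactly the entries generate_from_samples writes.
    expected = ['project.json', 'items.json', 'extractors.json', 'spiders']
    return all(os.path.exists(os.path.join(path, name)) for name in expected)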