A('price', "Product price, including any discounts and tax or vat", contains_any_numbers, True), A('image_urls', "URLs for one or more images", image_url, True), A('description', "The full description of the product", html), ] ) SAMPLE_DESCRIPTOR2 = ItemDescriptor('test', 'item test', [ A('description', 'description field without tags', notags), A('price', "Product price, including any discounts and tax or vat", contains_any_numbers), ]) SAMPLE_DESCRIPTOR3 = ItemDescriptor('test', 'item test', [A('phone', 'phone number', lambda x: contains_any_numbers(x.text_content))]) SAMPLE_DESCRIPTOR4 = ItemDescriptor('test', 'item test, removes tags from description attribute', [A('description', 'description field without tags', lambda x: x.text_content)]) # A list of (test name, [templates], page, extractors, expected_result) TEST_DATA = [ # extract from a similar page ('similar page extraction', [ANNOTATED_PAGE1], EXTRACT_PAGE1, DEFAULT_DESCRIPTOR, {u'title': [u'Nice Product'], u'description': [u'wonderful product'], u'image_url': [u'nice_product.jpg']} ), # strip the first 5 characters from the title ('extractor test', [ANNOTATED_PAGE1], EXTRACT_PAGE1, ItemDescriptor('test', 'product test',
def extract(self, htmlregion):
    """Extract from the region's text only if it has at least one number.

    The check is delegated to ``contains_any_numbers``, applied to
    ``htmlregion.text_content``; whatever that helper returns is handed
    back to the caller unchanged.
    """
    region_text = htmlregion.text_content
    return contains_any_numbers(region_text)
def extract(self, htmlregion):
    """Run ``extractors.contains_any_numbers`` on the region's text content.

    The result of that helper is returned unchanged.
    NOTE(review): presumably a value is produced only when the text
    holds at least one digit -- confirm against the extractors module.
    """
    region_text = htmlregion.text_content
    return extractors.contains_any_numbers(region_text)