def extract_value(self, resource_info):
    """Return the resource's plain text with a leading title removed.

    The plain text and title are obtained from ``self._get_plain_text``
    and ``self._get_title`` and passed through ``safe_unicode``. If the
    text begins with the title, exactly that prefix is cut off.
    """
    plain_text = safe_unicode(self._get_plain_text(resource_info))
    title = safe_unicode(self._get_title(resource_info))

    snippet_text = plain_text
    # Strip the title at the start of the plain text. Slice by length
    # instead of using ``lstrip(title)``: ``lstrip`` interprets its
    # argument as a *set of characters* and would keep eating leading
    # characters of the body that merely occur somewhere in the title.
    if title is not None and snippet_text.startswith(title):
        snippet_text = snippet_text[len(title):]

    return safe_unicode(snippet_text)
def extract_value(self, resource_info):
    """Return the value configured on this extractor.

    A ``str`` value is passed through ``safe_unicode``. For a
    multivalued field, a list is returned instead, built by passing
    each item of ``self.value`` through ``safe_unicode``.
    ``resource_info`` is not used.
    """
    result = self.value
    if isinstance(result, str):
        result = safe_unicode(result)

    if not self.field.multivalued:
        return result

    # Multivalued fields always yield a list of converted items.
    return [safe_unicode(item) for item in self.value]
def extract_value(self, resource_info):
    """Return a slug derived from the last path segment of the URL.

    The URL is read from ``resource_info.url_info['loc']``; trailing
    slashes are removed from its path before the last segment is taken.
    An empty segment (e.g. the site root) falls back to
    ``'index-html'``. The segment is turned into a slug by
    ``self._make_slug`` and passed through ``safe_unicode``.
    """
    location = resource_info.url_info.get('loc')
    trimmed_path = urlparse(location).path.rstrip('/')
    segments = trimmed_path.split('/')

    # An empty last segment means there is no file name to slugify.
    last_segment = segments[-1] or 'index-html'

    return safe_unicode(self._make_slug(last_segment))
def __init__(self, config, resource_info, converter):
    """Store config and resource info, then enrich the resource info.

    ``converter.extract_metadata`` and ``converter.extract_text`` are
    called (in that order) with the resource info; the results are
    stored on it as ``metadata`` and ``text`` respectively, the text
    being passed through ``safe_unicode`` first.
    """
    self.config = config
    self.resource_info = resource_info

    # Metadata is extracted and attached before the text is extracted.
    metadata = converter.extract_metadata(self.resource_info)
    self.resource_info.metadata = metadata

    raw_text = converter.extract_text(self.resource_info)
    self.resource_info.text = safe_unicode(raw_text)
def extract_value(self, resource_info):
    """Return the list of keywords from the resource's metadata.

    The ``'keywords'`` metadata entry is split on commas if it contains
    any, otherwise on whitespace. Each keyword is stripped and passed
    through ``safe_unicode``.

    Raises ``NoValueExtracted`` when there is no ``'keywords'`` entry.
    """
    raw_keywords = resource_info.metadata.get('keywords')
    if raw_keywords is None:
        raise NoValueExtracted

    # Comma is the preferred separator; whitespace is the fallback.
    parts = raw_keywords.split(',') if ',' in raw_keywords \
        else raw_keywords.split()

    return [safe_unicode(part.strip()) for part in parts]
def extract_value(self, resource_info):
    """Return the site attribute stored under ``self.key``.

    The attribute is looked up in ``resource_info.site.attributes``.
    ``str`` values are passed through ``safe_unicode``; any other value
    is returned unchanged.

    Raises ``NoValueExtracted`` when the attribute is missing or None.
    """
    attribute = resource_info.site.attributes.get(self.key)
    if attribute is None:
        raise NoValueExtracted

    if not isinstance(attribute, str):
        return attribute
    return safe_unicode(attribute)
def extract_value(self, resource_info):
    """Map the extracted value of another field through ``self.mapping``.

    The field named ``self.field_name`` is fetched from the field's
    config and its extractor is run on ``resource_info``. If the result
    is a key of ``self.mapping``, the mapped value is returned (passed
    through ``safe_unicode``). Otherwise ``self._default_or_raise()``
    decides the outcome.
    """
    source_field = self.field.config.get_field(self.field_name)
    extracted = source_field.extractor.extract_value(resource_info)

    # Both "field not extracted" and "field present but not mapped"
    # fall back to the default handling.
    if extracted is None or extracted not in self.mapping:
        return self._default_or_raise()

    return safe_unicode(self.mapping[extracted])
def extract_value(self, resource_info):
    """Map the value of an HTTP header through ``self.mapping``.

    The header named ``self.header_name`` is read from
    ``resource_info.headers``. For the ``content-type`` header
    (compared case-insensitively) the value is first run through
    ``get_content_type``. If the resulting value is a key of
    ``self.mapping``, the mapped value is returned (passed through
    ``safe_unicode``); otherwise ``self._default_or_raise()`` decides
    the outcome.
    """
    raw_header = resource_info.headers.get(self.header_name)
    if raw_header is None:
        # Header not present
        return self._default_or_raise()

    lookup_key = raw_header
    if self.header_name.lower() == 'content-type':
        lookup_key = get_content_type(raw_header)

    if lookup_key not in self.mapping:
        # Header present but not mapped
        return self._default_or_raise()

    return safe_unicode(self.mapping[lookup_key])
def extract_value(self, resource_info):
    """Return the resource's ``'loc'`` URL, passed through ``safe_unicode``.

    The lookup uses item access, so a missing ``'loc'`` key in
    ``resource_info.url_info`` propagates as an error from the lookup.
    """
    location = resource_info.url_info['loc']
    return safe_unicode(location)
def extract_value(self, resource_info):
    """Return the ``'example'`` metadata entry via ``safe_unicode``.

    Raises ``NoValueExtracted`` when the entry is missing or None.
    """
    example = resource_info.metadata.get('example')
    if example is not None:
        return safe_unicode(example)
    raise NoValueExtracted
def _default_or_raise(self):
    """Return ``self.default`` via ``safe_unicode``, or raise.

    Raises ``NoValueExtracted`` when no default is set (``None``).
    """
    if self.default is None:
        raise NoValueExtracted
    return safe_unicode(self.default)
def extract_value(self, resource_info):
    """Return the ``'target'`` URL if present, else the regular URL.

    When ``resource_info.url_info`` has no ``'target'`` key, the value
    is delegated to a fresh ``URLExtractor``. Otherwise the target is
    returned, passed through ``safe_unicode``.
    """
    if 'target' not in resource_info.url_info:
        return URLExtractor().extract_value(resource_info)
    return safe_unicode(resource_info.url_info['target'])
def extract_value(self, resource_info):
    """Return the resource's ``'loc'`` URL, passed through ``safe_unicode``.

    The lookup uses ``.get``, so a missing ``'loc'`` key results in
    ``safe_unicode(None)`` rather than a lookup error.
    """
    return safe_unicode(resource_info.url_info.get('loc'))