Пример #1
0
    def extract_value(self, resource_info):
        """Return the resource's plain text with a leading title removed.

        The plain text and the title are obtained from
        ``self._get_plain_text`` and ``self._get_title`` and passed through
        ``safe_unicode``. If the plain text begins with the title, exactly
        that prefix is removed; otherwise the text is returned unchanged.

        :param resource_info: resource description handed to the helpers.
        :return: the snippet text, passed through ``safe_unicode``.
        """
        plain_text = safe_unicode(self._get_plain_text(resource_info))
        title = safe_unicode(self._get_title(resource_info))

        snippet_text = plain_text
        # Remove the title only as an exact prefix. Slicing is used on
        # purpose: str.lstrip(title) treats its argument as a *set of
        # characters* and would also strip leading body characters that
        # merely happen to occur somewhere in the title.
        if title is not None and snippet_text.startswith(title):
            snippet_text = snippet_text[len(title):]
        return safe_unicode(snippet_text)
Пример #2
0
    def extract_value(self, resource_info):
        """Return the configured ``self.value``.

        A ``str`` value is passed through ``safe_unicode``. When
        ``self.field.multivalued`` is true, a list is returned instead, with
        every item of ``self.value`` passed through ``safe_unicode``.

        :param resource_info: unused by this method.
        :return: the value, or a list of values for multivalued fields.
        """
        raw = self.value
        single = safe_unicode(raw) if isinstance(raw, str) else raw

        if not self.field.multivalued:
            return single

        # Multivalued fields are rebuilt item by item from self.value.
        return [safe_unicode(item) for item in self.value]
Пример #3
0
 def extract_value(self, resource_info):
     """Build a slug from the last path segment of the resource's URL.

     The URL is read from ``resource_info.url_info['loc']`` (via ``get``).
     Trailing slashes are dropped from its path before the last segment is
     taken; an empty segment is replaced by ``'index-html'``. The segment
     is turned into a slug by ``self._make_slug``.

     :param resource_info: object exposing a ``url_info`` mapping.
     :return: the slug, passed through ``safe_unicode``.
     """
     location = resource_info.url_info.get('loc')
     trimmed_path = urlparse(location).path.rstrip('/')
     last_segment = trimmed_path.rsplit('/', 1)[-1]
     # A URL whose path is empty or only slashes has no segment to use.
     slug_source = 'index-html' if last_segment == '' else last_segment
     return safe_unicode(self._make_slug(slug_source))
Пример #4
0
    def __init__(self, config, resource_info, converter):
        """Store config and resource info, then fill in metadata and text.

        ``converter.extract_metadata`` is called first and its result is
        stored on ``resource_info.metadata``; ``converter.extract_text`` is
        called afterwards and its result, passed through ``safe_unicode``,
        is stored on ``resource_info.text``.

        :param config: configuration object kept on ``self.config``.
        :param resource_info: resource description; mutated in place.
        :param converter: object providing ``extract_metadata`` and
            ``extract_text``.
        """
        self.config = config
        self.resource_info = resource_info

        # Metadata is attached before text extraction runs, so the
        # converter sees it on the resource when extracting text.
        metadata = converter.extract_metadata(self.resource_info)
        self.resource_info.metadata = metadata

        raw_text = converter.extract_text(self.resource_info)
        self.resource_info.text = safe_unicode(raw_text)
Пример #5
0
 def extract_value(self, resource_info):
     """Return the list of keywords from the resource's metadata.

     The ``'keywords'`` metadata entry is split on commas when it contains
     at least one comma, otherwise on whitespace. Each keyword is stripped
     of surrounding whitespace; empty entries (e.g. from a trailing or
     doubled comma) are dropped.

     :param resource_info: object exposing a ``metadata`` mapping.
     :return: list of keywords, each passed through ``safe_unicode``.
     :raises NoValueExtracted: if there is no ``'keywords'`` entry.
     """
     value = resource_info.metadata.get('keywords')
     if value is None:
         raise NoValueExtracted
     # A comma anywhere means comma-separated, so multi-word keywords
     # survive; split(None) is the plain whitespace split.
     separator = ',' if ',' in value else None
     stripped = (kw.strip() for kw in value.split(separator))
     # Skip blanks so "a,,b," does not yield empty-string keywords.
     return [safe_unicode(kw) for kw in stripped if kw]
Пример #6
0
    def extract_value(self, resource_info):
        """Return the site attribute stored under ``self.key``.

        :param resource_info: object exposing ``site.attributes`` (a mapping).
        :return: the attribute value; ``str`` values are passed through
            ``safe_unicode``, other types are returned unchanged.
        :raises NoValueExtracted: if the attribute is missing or ``None``.
        """
        attribute = resource_info.site.attributes.get(self.key)

        if attribute is None:
            raise NoValueExtracted

        # Only strings are converted; any other type is handed back as is.
        return safe_unicode(attribute) if isinstance(attribute, str) else attribute
Пример #7
0
    def extract_value(self, resource_info):
        """Translate another field's extracted value through ``self.mapping``.

        The field named ``self.field_name`` is looked up on
        ``self.field.config`` and its extractor is run on ``resource_info``.
        If that yields ``None`` or a value that is not a key of
        ``self.mapping``, the result of ``self._default_or_raise()`` is
        returned.

        :param resource_info: resource description passed to the extractor.
        :return: the mapped value, passed through ``safe_unicode``, or the
            result of ``self._default_or_raise()``.
        """
        source_field = self.field.config.get_field(self.field_name)
        source_value = source_field.extractor.extract_value(resource_info)

        # Covers both "field not extracted" (None) and "field present but
        # not mapped"; the None check short-circuits the mapping lookup.
        if source_value is None or source_value not in self.mapping:
            return self._default_or_raise()

        return safe_unicode(self.mapping[source_value])
Пример #8
0
    def extract_value(self, resource_info):
        """Translate a header's value through ``self.mapping``.

        The header named ``self.header_name`` is read from
        ``resource_info.headers``. For the ``content-type`` header (compared
        case-insensitively) the value is first passed through
        ``get_content_type``. If the header is absent or its value is not a
        key of ``self.mapping``, the result of ``self._default_or_raise()``
        is returned.

        :param resource_info: object exposing a ``headers`` mapping.
        :return: the mapped value, passed through ``safe_unicode``, or the
            result of ``self._default_or_raise()``.
        """
        raw_header = resource_info.headers.get(self.header_name)
        if raw_header is None:
            # Header not present
            return self._default_or_raise()

        is_content_type = self.header_name.lower() == 'content-type'
        lookup_key = get_content_type(raw_header) if is_content_type else raw_header

        if lookup_key not in self.mapping:
            # Header present but not mapped
            return self._default_or_raise()
        return safe_unicode(self.mapping[lookup_key])
Пример #9
0
 def extract_value(self, resource_info):
     """Return the resource's ``'loc'`` URL, passed through ``safe_unicode``.

     :param resource_info: object exposing a ``url_info`` mapping that
         contains a ``'loc'`` key (a missing key raises on lookup).
     """
     location = resource_info.url_info['loc']
     return safe_unicode(location)
Пример #10
0
 def extract_value(self, resource_info):
     """Return the ``'example'`` metadata entry of the resource.

     :param resource_info: object exposing a ``metadata`` mapping.
     :return: the entry, passed through ``safe_unicode``.
     :raises NoValueExtracted: if the entry is missing or ``None``.
     """
     example = resource_info.metadata.get('example')
     if example is not None:
         return safe_unicode(example)
     raise NoValueExtracted
Пример #11
0
 def extract_value(self, resource_info):
     """Return the resource's ``'loc'`` URL, passed through ``safe_unicode``.

     :param resource_info: object exposing a ``url_info`` mapping that
         contains a ``'loc'`` key (a missing key raises on lookup).
     """
     location = resource_info.url_info['loc']
     return safe_unicode(location)
Пример #12
0
 def extract_value(self, resource_info):
     """Return the ``'example'`` metadata entry of the resource.

     :param resource_info: object exposing a ``metadata`` mapping.
     :return: the entry, passed through ``safe_unicode``.
     :raises NoValueExtracted: if the entry is missing or ``None``.
     """
     example = resource_info.metadata.get('example')
     if example is not None:
         return safe_unicode(example)
     raise NoValueExtracted
Пример #13
0
 def _default_or_raise(self):
     """Return ``self.default`` or signal that no value is available.

     :return: ``self.default`` passed through ``safe_unicode``.
     :raises NoValueExtracted: if ``self.default`` is ``None``.
     """
     if self.default is None:
         raise NoValueExtracted
     return safe_unicode(self.default)
Пример #14
0
 def extract_value(self, resource_info):
     """Return the resource's ``'target'`` URL, falling back to ``URLExtractor``.

     :param resource_info: object exposing a ``url_info`` mapping.
     :return: ``url_info['target']`` passed through ``safe_unicode`` when
         that key exists; otherwise whatever
         ``URLExtractor().extract_value(resource_info)`` returns.
     """
     if 'target' not in resource_info.url_info:
         # No explicit target: delegate to the plain URL extractor.
         return URLExtractor().extract_value(resource_info)
     return safe_unicode(resource_info.url_info['target'])
Пример #15
0
 def extract_value(self, resource_info):
     """Return the resource's ``'loc'`` URL, passed through ``safe_unicode``.

     The key is read with ``get``, so a missing ``'loc'`` results in
     ``safe_unicode(None)`` rather than a lookup error.

     :param resource_info: object exposing a ``url_info`` mapping.
     """
     return safe_unicode(resource_info.url_info.get('loc'))