Example #1
    def __init__(self, identity, text, url, harvest_details):
        self.text = text
        self.identity = identity
        self.url = url
        self.harvest_details = harvest_details

        # parse
        self.parser = Parser(text)
        self.parse()
Example #2
    def process_response(self, data):
        '''
        data here is just the content from the cleaned result set

        strip punctuation (modified version)
        tokenize
        strip stopwords
        '''

        def _strip_punctuation(text, simple_pattern=r'[;|>+:=#@%<?(){}`\'"]'):
            text = re.sub(simple_pattern, ' ', text)
            return text.replace("/", ' ')

        content = data['content']

        if self.include_structure:
            # include the xml tags, etc
            # note: this uses a different punctuation set
            bow = _strip_punctuation(content)
            # so this runs without error in ipy but not here.
            # TODO: fix that
            words = tokenize(bow)
            words = remove_stopwords(words)
            return words
        else:
            # pull out the text only
            parser = Parser(content)
            all_text = parser.find_nodes()

            # collapse to just text and attributes.text values
            bow = ' '.join([a.get('text', '') for a in all_text])

            # each node's 'attributes' entry is a list of dicts; flatten it
            atts = [att for a in all_text for att in a.get('attributes', [])]
            bow += ' ' + ' '.join([att.get('text', '') for att in atts])

            bow = remove_punctuation(bow)
            words = tokenize_text(bow)
            words = remove_stopwords(words)

            if len(words) < self.minimum_wordcount:
                return ''
            return ' '.join(words)
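
The helpers `tokenize`, `tokenize_text`, `remove_punctuation`, and `remove_stopwords` come from elsewhere in this codebase and are not shown. A minimal sketch of plausible implementations, purely as an assumption about their behavior:

    import re

    STOPWORDS = {'the', 'a', 'an', 'and', 'or', 'of', 'to', 'in'}  # assumed word list

    def remove_punctuation(text):
        # replace anything that is not a word character or whitespace
        return re.sub(r'[^\w\s]', ' ', text)

    def tokenize(text):
        # split on runs of whitespace; empty tokens disappear automatically
        return text.split()

    def remove_stopwords(words):
        # drop common words, case-insensitively
        return [w for w in words if w.lower() not in STOPWORDS]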
Example #3
    def create(self, doc):
        # the sha (generated from url if not in doc)
        self.source_url_sha = doc.get('url_hash')
        # TODO: verify that the pipeline output contains this
        self.source_url = doc.get('url')
        # this should come from the cleaned output of
        # the pipeline so that we don't need to keep
        # dealing with really junky strings
        # TODO: switch to pipeline clean content

        # cleaned_content = self._clean(doc.get('raw_content', ''))

        # try:
        #     parser = Parser(cleaned_content)
        #     cleaned_content = etree.tostring(parser.xml, pretty_print=True)
        #     fmt = 'xml'
        # except Exception as ex:
        #     print 'xml error', ex
        #     try:
        #         clean_json = json.loads(cleaned_content)
        #         fmt = 'json'
        #     except:
        #         fmt = 'unknown'

        fmt = doc.get('response_datatype', 'unknown')
        cleaned_content = doc.get('content')

        if fmt == 'xml':
            parser = Parser(cleaned_content.encode('utf-8'))
            try:
                self.namespaces = parser.namespaces
            except Exception as ex:
                print('namespace error', ex)

            try:
                self.schemas = self._pull_schemas(parser.xml)
            except Exception as ex:
                print('schema error', ex)
                traceback.print_exc()

        self.format = fmt
        self.cleaned_content = cleaned_content

        self.raw_content = doc.get('raw_content', '')
        self.raw_content_md5 = doc.get('digest', '')
        self.initial_harvest_date = doc.get('tstamp')
        self.host = doc.get('host', '')
        self.inlinks = doc.get('inlinks', [])
        self.outlinks = doc.get('outlinks', [])
        headers = doc.get('response_headers', [])
        self.headers = dict((k.strip(), v.strip())
                            for k, v in (h.split(':', 1) for h in headers))
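
The final generator pipeline turns raw 'Name: value' header strings into a dict; `split(':', 1)` splits only on the first colon, so values that themselves contain colons survive intact:

    headers = ['Content-Type: text/xml', 'Date: Tue, 01 Jan 2019 00:00:00 GMT']
    parsed = dict((k.strip(), v.strip())
                  for k, v in (h.split(':', 1) for h in headers))
    # {'Content-Type': 'text/xml', 'Date': 'Tue, 01 Jan 2019 00:00:00 GMT'}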
Example #4
    def __init__(self, yaml_files, source_content, source_url, **options):
        '''
        **options:
            parser: Parser from source_content
            ignore_case: bool
        '''
        self.yaml_files = yaml_files
        self.source_content = source_content
        self.source_url = source_url
        self.yaml = import_yaml_configs(self.yaml_files)
        self.parser = Parser(source_content)
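
`import_yaml_configs` is another project helper that is not shown; presumably it loads and merges the listed YAML files into one structure, roughly along these lines (the merge strategy here is an assumption):

    import yaml

    def import_yaml_configs(paths):
        # load each file in order; later files override earlier keys (assumed)
        config = {}
        for path in paths:
            with open(path) as f:
                config.update(yaml.safe_load(f) or {})
        return config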
Example #5
    def create(self, doc):
        # the sha (generated from url if not in doc)
        self.source_url_sha = doc.get('url_hash')
        # TODO: verify that the pipeline output contains this
        self.source_url = doc.get('url')
        fmt = doc.get('response_datatype', 'unknown')
        cleaned_content = doc.get('content')

        if fmt == 'xml':
            parser = Parser(cleaned_content.encode('utf-8'))

            if parser.xml is None:
                print(self.source_url_sha, self.source_url)
                fmt = 'xml;unparsed'

        #     try:
        #         self.namespaces = parser.namespaces
        #     except Exception as ex:
        #         print 'namespace error', ex

        #     try:
        #         self.schemas = self._pull_schemas(parser.xml)
        #     except Exception as ex:
        #         print 'schema error', ex
        #         traceback.print_exc()

        self.format = fmt
        self.cleaned_content = cleaned_content

        self.raw_content = doc.get('raw_content', '')
        self.raw_content_md5 = doc.get('digest', '')
        self.initial_harvest_date = doc.get('tstamp')
        self.host = doc.get('host', '')
        self.inlinks = doc.get('inlinks', [])
        self.outlinks = doc.get('outlinks', [])
        headers = doc.get('response_headers', [])
        self.headers = dict((k.strip(), v.strip())
                            for k, v in (h.split(':', 1) for h in headers))
        self.response_identity = next(iter(doc.get('identity', [])), {})
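
`next(iter(...), {})` is a safe way to take the first element of a possibly empty list, returning the default instead of raising `IndexError`:

    next(iter([{'protocol': 'OAI-PMH'}]), {})  # -> {'protocol': 'OAI-PMH'}
    next(iter([]), {})                         # -> {} (the default)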
Example #6
    def process_response(self, data):
        # do the response processing
        content = data['content'].encode('unicode_escape')
        parser = Parser(content)
        return parser.to_string()
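
`encode('unicode_escape')` replaces non-ASCII characters with backslash escape sequences before the content reaches the parser, which keeps the byte string ASCII-safe even when the declared encoding is unreliable:

    u'café'.encode('unicode_escape')  # -> 'caf\\xe9' (a byte string)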
Example #7
    def _load_xml(self):
        self.parser = Parser(self._response)
Example #8
    def _instantiate(self, identity, response, url, harvest_details):
        '''
        set up the router
        '''
        # TODO: add a filter for known protocol with type
        #       data and bail
        identity = next(iter(identity), {})
        protocol = identity.get('protocol', '')

        if protocol and protocol in self.optional_params.get(
                'ignore_protocols', []):
            return None

        # remap for the reader names without "Reader"
        _remap = {
            "OAI-PMH": "OaiPmh",
            "OGC": "Ogc",
            "UNIDATA": "Thredds",
            "ISO": "Iso",
            "RDF": "Rdf",
            "FGDC": "FgdcItem"
        }
        # this is bad naming, but: if the protocol value is a
        # remap key, use the mapped name; otherwise keep it as-is
        protocol = _remap.get(protocol, protocol)

        # we're going to pull the class name trick for the set
        # that is straightforward and hope not many are wonky
        if self.optional_params.get('parse_as_xml', False) and not protocol:
            protocol = 'Xml'

        # see if it's a loaded object
        try:
            reader_class = getattr(sys.modules[__name__], protocol + 'Reader')
        except AttributeError:
            # if it's not, we can't parse anyway
            return None

        if reader_class.__name__ in [
                'OpenSearchReader',
                'OaiPmhReader',
                'ThreddsReader',
                'IsoReader',
                'XmlReader',
                'OgcReader',
                'RdfReader']:
            # it's the standard processor init
            return reader_class(
                identity,
                response,
                url,
                harvest_details
            )
        elif reader_class.__name__ == 'FgdcItemReader':
            # TODO: this is baked into the others,
            #       we should take care of that
            parser = Parser(response)
            # TODO: don't forget to handle the harvest date
            return reader_class(parser.xml, url, harvest_details)

        return None
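
The `getattr(sys.modules[__name__], ...)` lookup resolves a reader class by name in the current module, so supporting a new protocol only requires defining a class with the matching name. A standalone sketch of the pattern, using getattr's default argument instead of the snippet's try/except (the class name is illustrative):

    import sys

    class IsoReader(object):
        pass

    def resolve(protocol):
        # look the class up by name in this module; None if it is not defined
        return getattr(sys.modules[__name__], protocol + 'Reader', None)

    print(resolve('Iso'))   # -> <class '__main__.IsoReader'>
    print(resolve('Nope'))  # -> None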
Example #9
    def __init__(self, text, handle_html=False, include_html_hrefs=False):
        self.text = text
        self.parser = Parser(text)

        self.handle_html = handle_html
        self.include_html_hrefs = include_html_hrefs