Example #1
    def __init__(self, identity, text, url, harvest_details):
        self.text = text
        self.identity = identity
        self.url = url
        self.harvest_details = harvest_details

        # parse
        self.parser = Parser(text)
        self.parse()
Example #2
    def process_response(self, data):
        '''
        data here is just the content from the cleaned result set

        strip punctuation (modified version)
        tokenize
        strip stopwords
        '''

        def _strip_punctuation(text, simple_pattern=r'[;|>+:=#@%<?(){}`\'"]'):
            text = re.sub(simple_pattern, ' ', text)
            return text.replace("/", ' ')

        content = data['content']

        if self.include_structure:
            # include the xml tags, etc
            # note: this uses a different punctuation set
            bow = _strip_punctuation(content)
            # so this runs without error in ipy but not here.
            # TODO: fix that
            words = tokenize(bow)
            words = remove_stopwords(words)
            return words
        else:
            # pull out the text only
            parser = Parser(content)
            all_text = parser.find_nodes()

            # collapse to just text and attributes.text values
            bow = ' '.join([a.get('text', '') for a in all_text])

            # each node's 'attributes' entry is a list of dicts; flatten it
            atts = [att for a in all_text for att in a.get('attributes', [])]
            bow += ' ' + ' '.join([att.get('text', '') for att in atts])

            bow = remove_punctuation(bow)
            words = tokenize_text(bow)
            words = remove_stopwords(words)

            if len(words) < self.minimum_wordcount:
                return ''
            return ' '.join(words)
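
The helpers `tokenize`, `tokenize_text`, `remove_punctuation`, and `remove_stopwords` come from elsewhere in this codebase and are not shown. A minimal sketch of plausible implementations, purely as an assumption about their behavior:

    import re

    STOPWORDS = {'the', 'a', 'an', 'and', 'or', 'of', 'to', 'in'}  # assumed word list

    def remove_punctuation(text):
        # replace anything that is not a word character or whitespace
        return re.sub(r'[^\w\s]', ' ', text)

    def tokenize(text):
        # split on runs of whitespace; empty tokens disappear automatically
        return text.split()

    def remove_stopwords(words):
        # drop common words, case-insensitively
        return [w for w in words if w.lower() not in STOPWORDS]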
Example #3
    def create(self, doc):
        # the sha (generated from url if not in doc)
        self.source_url_sha = doc.get('url_hash')
        # TODO: verify that the pipeline output contains this
        self.source_url = doc.get('url')
        # this should come from the cleaned output of
        # the pipeline so that we don't need to keep
        # dealing with really junky strings
        # TODO: switch to pipeline clean content

        # cleaned_content = self._clean(doc.get('raw_content', ''))

        # try:
        #     parser = Parser(cleaned_content)
        #     cleaned_content = etree.tostring(parser.xml, pretty_print=True)
        #     fmt = 'xml'
        # except Exception as ex:
        #     print 'xml error', ex
        #     try:
        #         clean_json = json.loads(cleaned_content)
        #         fmt = 'json'
        #     except:
        #         fmt = 'unknown'

        fmt = doc.get('response_datatype', 'unknown')
        cleaned_content = doc.get('content')

        if fmt == 'xml':
            parser = Parser(cleaned_content.encode('utf-8'))
            try:
                self.namespaces = parser.namespaces
            except Exception as ex:
                print('namespace error', ex)

            try:
                self.schemas = self._pull_schemas(parser.xml)
            except Exception as ex:
                print('schema error', ex)
                traceback.print_exc()

        self.format = fmt
        self.cleaned_content = cleaned_content

        self.raw_content = doc.get('raw_content', '')
        self.raw_content_md5 = doc.get('digest', '')
        self.initial_harvest_date = doc.get('tstamp')
        self.host = doc.get('host', '')
        self.inlinks = doc.get('inlinks', [])
        self.outlinks = doc.get('outlinks', [])
        headers = doc.get('response_headers', [])
        self.headers = dict((k.strip(), v.strip())
                            for k, v in (h.split(':', 1) for h in headers))
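
The final generator pipeline turns raw 'Name: value' header strings into a dict; `split(':', 1)` splits only on the first colon, so values that themselves contain colons survive intact:

    headers = ['Content-Type: text/xml', 'Date: Tue, 01 Jan 2019 00:00:00 GMT']
    parsed = dict((k.strip(), v.strip())
                  for k, v in (h.split(':', 1) for h in headers))
    # {'Content-Type': 'text/xml', 'Date': 'Tue, 01 Jan 2019 00:00:00 GMT'}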
Example #4
    def __init__(self, yaml_files, source_content, source_url, **options):
        '''
        **options:
            parser: Parser from source_content
            ignore_case: bool
        '''
        self.yaml_files = yaml_files
        self.source_content = source_content
        self.source_url = source_url
        self.yaml = import_yaml_configs(self.yaml_files)
        self.parser = Parser(source_content)
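
`import_yaml_configs` is another project helper that is not shown; presumably it loads and merges the listed YAML files into one structure, roughly along these lines (the merge strategy here is an assumption):

    import yaml

    def import_yaml_configs(paths):
        # load each file in order; later files override earlier keys (assumed)
        config = {}
        for path in paths:
            with open(path) as f:
                config.update(yaml.safe_load(f) or {})
        return config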
Example #5
    def create(self, doc):
        # the sha (generated from url if not in doc)
        self.source_url_sha = doc.get('url_hash')
        # TODO: verify that the pipeline output contains this
        self.source_url = doc.get('url')
        fmt = doc.get('response_datatype', 'unknown')
        cleaned_content = doc.get('content')

        if fmt == 'xml':
            parser = Parser(cleaned_content.encode('utf-8'))

            if parser.xml is None:
                print(self.source_url_sha, self.source_url)
                fmt = 'xml;unparsed'

        #     try:
        #         self.namespaces = parser.namespaces
        #     except Exception as ex:
        #         print 'namespace error', ex

        #     try:
        #         self.schemas = self._pull_schemas(parser.xml)
        #     except Exception as ex:
        #         print 'schema error', ex
        #         traceback.print_exc()

        self.format = fmt
        self.cleaned_content = cleaned_content

        self.raw_content = doc.get('raw_content', '')
        self.raw_content_md5 = doc.get('digest', '')
        self.initial_harvest_date = doc.get('tstamp')
        self.host = doc.get('host', '')
        self.inlinks = doc.get('inlinks', [])
        self.outlinks = doc.get('outlinks', [])
        headers = doc.get('response_headers', [])
        self.headers = dict((k.strip(), v.strip())
                            for k, v in (h.split(':', 1) for h in headers))
        self.response_identity = next(iter(doc.get('identity', [])), {})
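
`next(iter(...), {})` is a safe way to take the first element of a possibly empty list, returning the default instead of raising `IndexError`:

    next(iter([{'protocol': 'OAI-PMH'}]), {})  # -> {'protocol': 'OAI-PMH'}
    next(iter([]), {})                         # -> {} (the default)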
Example #6
    def process_response(self, data):
        # do the response processing
        content = data['content'].encode('unicode_escape')
        parser = Parser(content)
        return parser.to_string()
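
`encode('unicode_escape')` replaces non-ASCII characters with backslash escape sequences before the content reaches the parser, which keeps the byte string ASCII-safe even when the declared encoding is unreliable:

    u'café'.encode('unicode_escape')  # -> 'caf\\xe9' (a byte string)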
Example #7
    def _load_xml(self):
        self.parser = Parser(self._response)
Example #8
    def _instantiate(self, identity, response, url, harvest_details):
        '''
        set up the router
        '''
        # TODO: add a filter for known protocol with type
        #       data and bail
        identity = next(iter(identity), {})
        protocol = identity.get('protocol', '')

        if protocol and protocol in self.optional_params.get(
                'ignore_protocols', []):
            return None

        # remap for the reader names without "Reader"
        _remap = {
            "OAI-PMH": "OaiPmh",
            "OGC": "Ogc",
            "UNIDATA": "Thredds",
            "ISO": "Iso",
            "RDF": "Rdf",
            "FGDC": "FgdcItem"
        }
        # this is bad naming, but: if the protocol value is a
        # remap key, use the mapped name; otherwise keep it as-is
        protocol = _remap.get(protocol, protocol)

        # we're going to pull the class name trick for the set
        # that is straightforward and hope not many are wonky
        if self.optional_params.get('parse_as_xml', False) and not protocol:
            protocol = 'Xml'

        # see if it's a loaded object
        try:
            reader_class = getattr(sys.modules[__name__], protocol + 'Reader')
        except AttributeError:
            # if it's not, we can't parse anyway
            return None

        if reader_class.__name__ in [
                'OpenSearchReader',
                'OaiPmhReader',
                'ThreddsReader',
                'IsoReader',
                'XmlReader',
                'OgcReader',
                'RdfReader']:
            # it's the standard processor init
            return reader_class(
                identity,
                response,
                url,
                harvest_details
            )
        elif reader_class.__name__ == 'FgdcItemReader':
            # TODO: this is baked into the others,
            #       we should take care of that
            parser = Parser(response)
            # TODO: don't forget to handle the harvest date
            return reader_class(parser.xml, url, harvest_details)

        return None
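
The `getattr(sys.modules[__name__], ...)` lookup resolves a reader class by name in the current module, so supporting a new protocol only requires defining a class with the matching name. A standalone sketch of the pattern, using getattr's default argument instead of the snippet's try/except (the class name is illustrative):

    import sys

    class IsoReader(object):
        pass

    def resolve(protocol):
        # look the class up by name in this module; None if it is not defined
        return getattr(sys.modules[__name__], protocol + 'Reader', None)

    print(resolve('Iso'))   # -> <class '__main__.IsoReader'>
    print(resolve('Nope'))  # -> None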
Example #9
    def __init__(self, text, handle_html=False, include_html_hrefs=False):
        self.text = text
        self.parser = Parser(text)

        self.handle_html = handle_html
        self.include_html_hrefs = include_html_hrefs