예제 #1
0
    def __init__(self,
                 parselet,
                 selector_handler=None,
                 strict=False,
                 debug=False):
        """
        Take a parselet and optional selector_handler
        and build an abstract representation of the Parsley extraction
        logic.

        Two helper class methods can be used to instantiate a Parselet
        from JSON rules: :meth:`.from_jsonstring`, :meth:`.from_jsonfile`.

        :param dict parselet: Parsley script as a Python dict object
        :param boolean strict: Set to *True* is you want to
            enforce that missing required keys raise an Exception; default is False
            (i.e. lenient/non-strict mode)
        :param selector_handler: an instance of :class:`selectors.SelectorHandler`
            optional selector handler instance;
            defaults to an instance of :class:`selectors.DefaultSelectorHandler`
        :raises: :class:`.InvalidKeySyntax`

        Example:

        >>> import parslepy
        >>> rules = {
        ...     "heading": "h1#main",
        ...     "news(li.newsitem)": [{
        ...         "title": ".",
        ...         "url": "a/@href"
        ...     }],
        ... }
        >>> p = parslepy.Parselet(rules)
        >>> type(p)
        <class 'parslepy.base.Parselet'>

        Use :meth:`~base.Parselet.extract` or :meth:`~base.Parselet.parse`
        to get extracted content from documents.
        """

        if debug:
            self.DEBUG = True
        if strict:
            self.STRICT_MODE = True

        self.parselet = parselet

        if not selector_handler:
            self.selector_handler = DefaultSelectorHandler(debug=self.DEBUG)

        elif not (isinstance(selector_handler, SelectorHandler)):
            raise ValueError("You must provide a SelectorHandler instance")

        else:
            self.selector_handler = selector_handler

        self.compile()
예제 #2
0
파일: base.py 프로젝트: nerixim/parslepy
    def __init__(self, parselet, selector_handler=None, strict=False, debug=False):
        """
        Take a parselet and optional selector_handler
        and build an abstract representation of the Parsley extraction
        logic.

        Four helper class methods can be used to instantiate a Parselet
        from JSON/YAML rules: :meth:`.from_jsonstring`, :meth:`.from_jsonfile`,
        :meth:`.from_yamlstring`, :meth:`.from_yamlfile`.

        :param dict parselet: Parsley script as a Python dict object
        :param boolean strict: Set to *True* is you want to
            enforce that missing required keys raise an Exception; default is False
            (i.e. lenient/non-strict mode)
        :param selector_handler: an instance of :class:`selectors.SelectorHandler`
            optional selector handler instance;
            defaults to an instance of :class:`selectors.DefaultSelectorHandler`
        :raises: :class:`.InvalidKeySyntax`

        Example:

        >>> import parslepy
        >>> rules = {
        ...     "heading": "h1#main",
        ...     "news(li.newsitem)": [{
        ...         "title": ".",
        ...         "url": "a/@href"
        ...     }],
        ... }
        >>> p = parslepy.Parselet(rules)
        >>> type(p)
        <class 'parslepy.base.Parselet'>

        Use :meth:`~base.Parselet.extract` or :meth:`~base.Parselet.parse`
        to get extracted content from documents.
        """

        if debug:
            self.DEBUG = True
        if strict:
            self.STRICT_MODE = True

        self.parselet =  parselet

        if not selector_handler:
            self.selector_handler = DefaultSelectorHandler(debug=self.DEBUG)

        elif not(isinstance(selector_handler, SelectorHandler)):
            raise ValueError("You must provide a SelectorHandler instance")

        else:
            self.selector_handler = selector_handler

        self.compile()
예제 #3
0
class Parselet(object):

    DEBUG = False
    SPECIAL_LEVEL_KEY = "--"
    KEEP_ONLY_FIRST_ELEMENT_IF_LIST = True
    STRICT_MODE = False

    def __init__(self,
                 parselet,
                 selector_handler=None,
                 strict=False,
                 debug=False):
        """
        Take a parselet and optional selector_handler
        and build an abstract representation of the Parsley extraction
        logic.

        Two helper class methods can be used to instantiate a Parselet
        from JSON rules: :meth:`.from_jsonstring`, :meth:`.from_jsonfile`.

        :param dict parselet: Parsley script as a Python dict object
        :param boolean strict: Set to *True* is you want to
            enforce that missing required keys raise an Exception; default is False
            (i.e. lenient/non-strict mode)
        :param selector_handler: an instance of :class:`selectors.SelectorHandler`
            optional selector handler instance;
            defaults to an instance of :class:`selectors.DefaultSelectorHandler`
        :raises: :class:`.InvalidKeySyntax`

        Example:

        >>> import parslepy
        >>> rules = {
        ...     "heading": "h1#main",
        ...     "news(li.newsitem)": [{
        ...         "title": ".",
        ...         "url": "a/@href"
        ...     }],
        ... }
        >>> p = parslepy.Parselet(rules)
        >>> type(p)
        <class 'parslepy.base.Parselet'>

        Use :meth:`~base.Parselet.extract` or :meth:`~base.Parselet.parse`
        to get extracted content from documents.
        """

        if debug:
            self.DEBUG = True
        if strict:
            self.STRICT_MODE = True

        self.parselet = parselet

        if not selector_handler:
            self.selector_handler = DefaultSelectorHandler(debug=self.DEBUG)

        elif not (isinstance(selector_handler, SelectorHandler)):
            raise ValueError("You must provide a SelectorHandler instance")

        else:
            self.selector_handler = selector_handler

        self.compile()

    # accept comments in parselets
    REGEX_COMMENT_LINE = re.compile(r'^\s*#')

    @classmethod
    def from_jsonfile(cls,
                      fp,
                      selector_handler=None,
                      strict=False,
                      debug=False):
        """
        Create a Parselet instance from a file containing
        the Parsley script as a JSON object

        >>> import parslepy
        >>> with open('parselet.json') as fp:
        ...     parslepy.Parselet.from_jsonfile(fp)
        ...
        <parslepy.base.Parselet object at 0x2014e50>

        :param file fp: an open file-like pointer containing the Parsley script
        :rtype: :class:`.Parselet`

        Other arguments: same as for :class:`.Parselet` contructor
        """

        return cls._from_jsonlines(fp,
                                   selector_handler=selector_handler,
                                   strict=strict,
                                   debug=debug)

    @classmethod
    def from_jsonstring(cls,
                        s,
                        selector_handler=None,
                        strict=False,
                        debug=False):
        """
        Create a Parselet instance from s (str) containing
        the Parsley script as JSON

        >>> import parslepy
        >>> parsley_string = '{ "title": "h1", "link": "a @href"}'
        >>> p = parslepy.Parselet.from_jsonstring(parsley_string)
        >>> type(p)
        <class 'parslepy.base.Parselet'>
        >>>

        :param string s: a Parsley script as a JSON string
        :rtype: :class:`.Parselet`

        Other arguments: same as for :class:`.Parselet` contructor
        """

        return cls._from_jsonlines(s.split("\n"),
                                   selector_handler=selector_handler,
                                   strict=strict,
                                   debug=debug)

    @classmethod
    def _from_jsonlines(cls,
                        lines,
                        selector_handler=None,
                        strict=False,
                        debug=False):
        """
        Interpret input lines as a JSON Parsley script.
        Python-style comment lines are skipped.
        """

        return cls(json.loads("\n".join(
            [l for l in lines if not cls.REGEX_COMMENT_LINE.match(l)])),
                   selector_handler=selector_handler,
                   strict=strict,
                   debug=debug)

    def parse(self, fp, parser=None, context=None):
        """
        Parse an HTML or XML document and
        return the extacted object following the Parsley rules give at instantiation.

        :param fp: file-like object containing an HTML or XML document, or URL or filename
        :param parser: *lxml.etree._FeedParser* instance (optional); defaults to lxml.etree.HTMLParser()
        :param context: user-supplied context that will be passed to custom XPath extensions (as first argument)
        :rtype: Python :class:`dict` object with mapped extracted content
        :raises: :class:`.NonMatchingNonOptionalKey`

        To parse from a string, use the :meth:`~base.Parselet.parse_fromstring` method instead.

        Note that the fp paramater is passed directly
        to `lxml.etree.parse <http://lxml.de/api/lxml.etree-module.html#parse>`_,
        so you can also give it an URL, and lxml will download it for you.
        (Also see `<http://lxml.de/tutorial.html#the-parse-function>`_.)
        """

        if parser is None:
            parser = lxml.etree.HTMLParser()
        doc = lxml.etree.parse(fp, parser=parser).getroot()
        return self.extract(doc, context=context)

    def parse_fromstring(self, s, parser=None, context=None):
        """
        Parse an HTML or XML document and
        return the extacted object following the Parsley rules give at instantiation.

        :param string s: an HTML or XML document as a string
        :param parser: *lxml.etree._FeedParser* instance (optional); defaults to lxml.etree.HTMLParser()
        :param context: user-supplied context that will be passed to custom XPath extensions (as first argument)
        :rtype: Python :class:`dict` object with mapped extracted content
        :raises: :class:`.NonMatchingNonOptionalKey`

        """
        if parser is None:
            parser = lxml.etree.HTMLParser()
        doc = lxml.etree.fromstring(s, parser=parser)
        return self.extract(doc, context=context)

    def compile(self):
        """
        Build the abstract Parsley tree starting from the root node
        (recursive)
        """
        if not isinstance(self.parselet, dict):
            raise ValueError(
                "Parselet must be a dict of some sort. Or use .from_jsonstring() or .from_jsonfile()"
            )
        self.parselet_tree = self._compile(self.parselet)

    VALID_KEY_CHARS = "\w-"
    SUPPORTED_OPERATORS = "?"  # "!" not supported for now
    REGEX_PARSELET_KEY = re.compile(
        "^(?P<key>[%(validkeychars)s]+)(?P<operator>[%(suppop)s])?(\((?P<scope>.+)\))?$"
        % {
            'validkeychars': VALID_KEY_CHARS,
            'suppop': SUPPORTED_OPERATORS
        })

    def _compile(self, parselet_node, level=0):
        """
        Build part of the abstract Parsley extraction tree

        Arguments:
        parselet_node (dict) -- part of the Parsley tree to compile
                                (can be the root dict/node)
        level (int)          -- current recursion depth (used for debug)
        """

        if self.DEBUG:
            debug_offset = "".join(["    " for x in range(level)])

        if self.DEBUG:
            print(debug_offset,
                  "%s::compile(%s)" % (self.__class__.__name__, parselet_node))

        if isinstance(parselet_node, dict):
            parselet_tree = ParsleyNode()
            for k, v in list(parselet_node.items()):

                # we parse the key raw elements but without much
                # interpretation (which is done by the SelectorHandler)
                try:
                    m = self.REGEX_PARSELET_KEY.match(k)
                    if not m:
                        if self.DEBUG:
                            print(debug_offset, "could not parse key", k)
                        raise InvalidKeySyntax(k)
                except:
                    raise InvalidKeySyntax("Key %s is not valid" % k)

                key = m.group('key')
                # by default, fields are required
                key_required = True
                operator = m.group('operator')
                if operator == '?':
                    key_required = False
                # FIXME: "!" operator not supported (complete array)
                scope = m.group('scope')

                # example: get list of H3 tags
                # { "titles": ["h3"] }
                # FIXME: should we support multiple selectors in list?
                #        e.g. { "titles": ["h1", "h2", "h3", "h4"] }
                if isinstance(v, (list, tuple)):
                    v = v[0]
                    iterate = True
                else:
                    iterate = False

                # keys in the abstract Parsley trees are of type `ParsleyContext`
                try:
                    parsley_context = ParsleyContext(
                        key,
                        operator=operator,
                        required=key_required,
                        scope=self.selector_handler.make(scope)
                        if scope else None,
                        iterate=iterate)
                except SyntaxError:
                    if self.DEBUG:
                        print("Invalid scope:", k, scope)
                    raise

                if self.DEBUG:
                    print(debug_offset, "current context:", parsley_context)

                # go deeper in the Parsley tree...
                try:
                    child_tree = self._compile(v, level=level + 1)
                except SyntaxError:
                    if self.DEBUG:
                        print("Invalid value: ", v)
                    raise
                except:
                    raise

                if self.DEBUG:
                    print(debug_offset, "child tree:", child_tree)

                parselet_tree[parsley_context] = child_tree

            return parselet_tree

        # a string leaf should match some kind of selector,
        # let the selector handler deal with it
        elif isstr(parselet_node):
            return self.selector_handler.make(parselet_node)
        else:
            raise ValueError("Unsupported type(%s) for Parselet node <%s>" %
                             (type(parselet_node), parselet_node))

    def extract(self, document, context=None):
        """
        Extract values as a dict object following the structure
        of the Parsley script (recursive)

        :param document: lxml-parsed document
        :param context: user-supplied context that will be passed to custom XPath extensions (as first argument)
        :rtype: Python *dict* object with mapped extracted content
        :raises: :class:`.NonMatchingNonOptionalKey`

        >>> import lxml.etree
        >>> import parslepy
        >>> html = '''
        ... <!DOCTYPE html>
        ... <html>
        ... <head>
        ...     <title>Sample document to test parslepy</title>
        ...     <meta http-equiv="content-type" content="text/html;charset=utf-8" />
        ... </head>
        ... <body>
        ... <h1 id="main">What&rsquo;s new</h1>
        ... <ul>
        ...     <li class="newsitem"><a href="/article-001.html">This is the first article</a></li>
        ...     <li class="newsitem"><a href="/article-002.html">A second report on something</a></li>
        ...     <li class="newsitem"><a href="/article-003.html">Python is great!</a> <span class="fresh">New!</span></li>
        ... </ul>
        ... </body>
        ... </html>
        ... '''
        >>> html_parser = lxml.etree.HTMLParser()
        >>> doc = lxml.etree.fromstring(html, parser=html_parser)
        >>> doc
        <Element html at 0x7f5fb1fce9b0>
        >>> rules = {
        ...     "headingcss": "#main",
        ...     "headingxpath": "//h1[@id='main']"
        ... }
        >>> p = parslepy.Parselet(rules)
        >>> p.extract(doc)
        {'headingcss': u'What\u2019s new', 'headingxpath': u'What\u2019s new'}

        """
        if context:
            self.selector_handler.context = context
        return self._extract(self.parselet_tree, document)

    def _extract(self, parselet_node, document, level=0):
        """
        Extract values at this document node level
        using the parselet_node instructions:
        - go deeper in tree
        - or call selector handler in case of a terminal selector leaf
        """

        if self.DEBUG:
            debug_offset = "".join(["    " for x in range(level)])

        # we must go deeper in the Parsley tree
        if isinstance(parselet_node, ParsleyNode):

            # default output
            output = {}

            # process all children
            for ctx, v in list(parselet_node.items()):
                if self.DEBUG:
                    print(debug_offset, "context:", ctx, v)
                extracted = None
                try:
                    # scoped-extraction:
                    # extraction should be done deeper in the document tree
                    if ctx.scope:
                        extracted = []
                        selected = self.selector_handler.select(
                            document, ctx.scope)
                        if selected:
                            for i, elem in enumerate(selected, start=1):
                                parse_result = self._extract(v,
                                                             elem,
                                                             level=level + 1)

                                if isinstance(parse_result, (list, tuple)):
                                    extracted.extend(parse_result)
                                else:
                                    extracted.append(parse_result)

                                # if we're not in an array,
                                # we only care about the first iteration
                                if not ctx.iterate:
                                    break

                            if self.DEBUG:
                                print(
                                    debug_offset,
                                    "parsed %d elements in scope (%s)" %
                                    (i, ctx.scope))

                    # local extraction
                    else:
                        extracted = self._extract(v, document, level=level + 1)

                except NonMatchingNonOptionalKey as e:
                    if self.DEBUG:
                        print(debug_offset, str(e))
                    if not ctx.required or not self.STRICT_MODE:
                        output[ctx.key] = {}
                    else:
                        raise
                except Exception as e:
                    if self.DEBUG:
                        print(str(e))
                    raise

                # replace empty-list result when not looping by empty dict
                if (isinstance(extracted, list) and not extracted
                        and not ctx.iterate):
                    extracted = {}

                # keep only the first element if we're not in an array
                if self.KEEP_ONLY_FIRST_ELEMENT_IF_LIST:
                    try:
                        if (isinstance(extracted, list) and extracted
                                and not ctx.iterate):

                            if self.DEBUG:
                                print(debug_offset, "keep only 1st element")
                            extracted = extracted[0]

                    except Exception as e:
                        if self.DEBUG:
                            print(str(e))
                            print(debug_offset, "error getting first element")

                # extraction for a required key gave nothing
                if (self.STRICT_MODE and ctx.required and extracted is None):
                    raise NonMatchingNonOptionalKey(
                        'key "%s" is required but yield nothing\nCurrent path: %s/(%s)\n'
                        %
                        (ctx.key, document.getroottree().getpath(document), v))

                # special key to extract a selector-defined level deeper
                # but still output at same level
                # this can be useful for breaking up long selectors
                # or when you need to mix XPath and CSS selectors
                # e.g.
                # {
                #   "something(#content div.main)": {
                #       "--(.//div[re:test(@class, 'style\d{3,6}')])": {
                #           "title": "h1",
                #           "subtitle": "h2"
                #       }
                #   }
                # }
                #
                if ctx.key == self.SPECIAL_LEVEL_KEY:
                    if isinstance(extracted, dict):
                        output.update(extracted)
                    elif isinstance(extracted, list):
                        if extracted:
                            raise RuntimeError(
                                "could not merge non-empty list at higher level"
                            )
                        else:
                            #empty list, dont bother?
                            pass
                else:
                    # required keys are handled above
                    if extracted is not None:
                        output[ctx.key] = extracted
                    else:
                        # do not add this optional key/value pair in the output
                        pass

            return output

        # a leaf/Selector node
        elif isinstance(parselet_node, Selector):
            return self.selector_handler.extract(document, parselet_node)

        else:
            # FIXME: can this happen?
            #        if selector handler returned None at compile time,
            #        probably yes
            pass

    def keys(self):
        """
        Return a list of 1st level keys of the output data model

        >>> import parslepy
        >>> rules = {
        ...     "headingcss": "#main",
        ...     "headingxpath": "//h1[@id='main']"
        ... }
        >>> p = parslepy.Parselet(rules)
        >>> sorted(p.keys())
        ['headingcss', 'headingxpath']

        """
        return self._keys(self.parselet_tree)

    def _keys(self, parselet_node):
        keys = []
        if isinstance(parselet_node, ParsleyNode):
            for ctx, v in list(parselet_node.items()):
                if ctx.key == self.SPECIAL_LEVEL_KEY:
                    keys.extend(self._keys(v))
                else:
                    keys.append(ctx.key)
        return keys
예제 #4
0
파일: base.py 프로젝트: nerixim/parslepy
class Parselet(object):

    DEBUG = False
    SPECIAL_LEVEL_KEY = "--"
    KEEP_ONLY_FIRST_ELEMENT_IF_LIST = True
    STRICT_MODE = False

    def __init__(self, parselet, selector_handler=None, strict=False, debug=False):
        """
        Take a parselet and optional selector_handler
        and build an abstract representation of the Parsley extraction
        logic.

        Four helper class methods can be used to instantiate a Parselet
        from JSON/YAML rules: :meth:`.from_jsonstring`, :meth:`.from_jsonfile`,
        :meth:`.from_yamlstring`, :meth:`.from_yamlfile`.

        :param dict parselet: Parsley script as a Python dict object
        :param boolean strict: Set to *True* is you want to
            enforce that missing required keys raise an Exception; default is False
            (i.e. lenient/non-strict mode)
        :param selector_handler: an instance of :class:`selectors.SelectorHandler`
            optional selector handler instance;
            defaults to an instance of :class:`selectors.DefaultSelectorHandler`
        :raises: :class:`.InvalidKeySyntax`

        Example:

        >>> import parslepy
        >>> rules = {
        ...     "heading": "h1#main",
        ...     "news(li.newsitem)": [{
        ...         "title": ".",
        ...         "url": "a/@href"
        ...     }],
        ... }
        >>> p = parslepy.Parselet(rules)
        >>> type(p)
        <class 'parslepy.base.Parselet'>

        Use :meth:`~base.Parselet.extract` or :meth:`~base.Parselet.parse`
        to get extracted content from documents.
        """

        if debug:
            self.DEBUG = True
        if strict:
            self.STRICT_MODE = True

        self.parselet =  parselet

        if not selector_handler:
            self.selector_handler = DefaultSelectorHandler(debug=self.DEBUG)

        elif not(isinstance(selector_handler, SelectorHandler)):
            raise ValueError("You must provide a SelectorHandler instance")

        else:
            self.selector_handler = selector_handler

        self.compile()

    # accept comments in parselets
    REGEX_COMMENT_LINE = re.compile(r'^\s*#')
    @classmethod
    def from_jsonfile(cls, fp, selector_handler=None, strict=False, debug=False):
        """
        Create a Parselet instance from a file containing
        the Parsley script as a JSON object

        >>> import parslepy
        >>> with open('parselet.json') as fp:
        ...     parslepy.Parselet.from_jsonfile(fp)
        ...
        <parslepy.base.Parselet object at 0x2014e50>

        :param file fp: an open file-like pointer containing the Parsley script
        :rtype: :class:`.Parselet`

        Other arguments: same as for :class:`.Parselet` contructor
        """

        return cls._from_jsonlines(fp,
            selector_handler=selector_handler, strict=strict, debug=debug)

    @classmethod
    def from_yamlfile(cls, fp, selector_handler=None, strict=False, debug=False):
        """
        Create a Parselet instance from a file containing
        the Parsley script as a YAML object

        >>> import parslepy
        >>> with open('parselet.yml') as fp:
        ...     parslepy.Parselet.from_yamlfile(fp)
        ...
        <parslepy.base.Parselet object at 0x2014e50>

        :param file fp: an open file-like pointer containing the Parsley script
        :rtype: :class:`.Parselet`

        Other arguments: same as for :class:`.Parselet` contructor
        """

        return cls.from_yamlstring(fp.read(), selector_handler=selector_handler, strict=strict, debug=debug)

    @classmethod
    def from_yamlstring(cls, s, selector_handler=None, strict=False, debug=False):
        """
        Create a Parselet instance from s (str) containing
        the Parsley script as YAML

        >>> import parslepy
        >>> parsley_string = '''---
            title: h1
            link: a @href
        '''
        >>> p = parslepy.Parselet.from_yamlstring(parsley_string)
        >>> type(p)
        <class 'parslepy.base.Parselet'>
        >>>

        :param string s: a Parsley script as a YAML string
        :rtype: :class:`.Parselet`

        Other arguments: same as for :class:`.Parselet` contructor
        """

        import yaml
        return cls(yaml.load(s), selector_handler=selector_handler, strict=strict, debug=debug)

    @classmethod
    def from_jsonstring(cls, s, selector_handler=None, strict=False, debug=False):
        """
        Create a Parselet instance from s (str) containing
        the Parsley script as JSON

        >>> import parslepy
        >>> parsley_string = '{ "title": "h1", "link": "a @href"}'
        >>> p = parslepy.Parselet.from_jsonstring(parsley_string)
        >>> type(p)
        <class 'parslepy.base.Parselet'>
        >>>

        :param string s: a Parsley script as a JSON string
        :rtype: :class:`.Parselet`

        Other arguments: same as for :class:`.Parselet` contructor
        """

        return cls._from_jsonlines(s.split("\n"),
            selector_handler=selector_handler, strict=strict, debug=debug)

    @classmethod
    def _from_jsonlines(cls, lines, selector_handler=None, strict=False, debug=False):
        """
        Interpret input lines as a JSON Parsley script.
        Python-style comment lines are skipped.
        """

        return cls(json.loads(
                "\n".join([l for l in lines if not cls.REGEX_COMMENT_LINE.match(l)])
            ), selector_handler=selector_handler, strict=strict, debug=debug)

    def parse(self, fp, parser=None, context=None):
        """
        Parse an HTML or XML document and
        return the extacted object following the Parsley rules give at instantiation.

        :param fp: file-like object containing an HTML or XML document, or URL or filename
        :param parser: *lxml.etree._FeedParser* instance (optional); defaults to lxml.etree.HTMLParser()
        :param context: user-supplied context that will be passed to custom XPath extensions (as first argument)
        :rtype: Python :class:`dict` object with mapped extracted content
        :raises: :class:`.NonMatchingNonOptionalKey`

        To parse from a string, use the :meth:`~base.Parselet.parse_fromstring` method instead.

        Note that the fp paramater is passed directly
        to `lxml.etree.parse <http://lxml.de/api/lxml.etree-module.html#parse>`_,
        so you can also give it an URL, and lxml will download it for you.
        (Also see `<http://lxml.de/tutorial.html#the-parse-function>`_.)
        """

        if parser is None:
            parser = lxml.etree.HTMLParser()
        doc = lxml.etree.parse(fp, parser=parser).getroot()
        return self.extract(doc, context=context)

    def parse_fromstring(self, s, parser=None, context=None):
        """
        Parse an HTML or XML document and
        return the extacted object following the Parsley rules give at instantiation.

        :param string s: an HTML or XML document as a string
        :param parser: *lxml.etree._FeedParser* instance (optional); defaults to lxml.etree.HTMLParser()
        :param context: user-supplied context that will be passed to custom XPath extensions (as first argument)
        :rtype: Python :class:`dict` object with mapped extracted content
        :raises: :class:`.NonMatchingNonOptionalKey`

        """
        if parser is None:
            parser = lxml.etree.HTMLParser()
        doc = lxml.etree.fromstring(s, parser=parser)
        return self.extract(doc, context=context)

    def compile(self):
        """
        Build the abstract Parsley tree starting from the root node
        (recursive)
        """
        if not isinstance(self.parselet, dict):
            raise ValueError("Parselet must be a dict of some sort. Or use .from_jsonstring(), " \
                ".from_jsonfile(), .from_yamlstring(), or .from_yamlfile()")
        self.parselet_tree = self._compile(self.parselet)

    VALID_KEY_CHARS = "\w-"
    SUPPORTED_OPERATORS = "?"   # "!" not supported for now
    REGEX_PARSELET_KEY = re.compile(
        "^(?P<key>[%(validkeychars)s]+)(?P<operator>[%(suppop)s])?(\((?P<scope>.+)\))?$" % {
            'validkeychars': VALID_KEY_CHARS,
            'suppop': SUPPORTED_OPERATORS}
        )
    def _compile(self, parselet_node, level=0):
        """
        Build part of the abstract Parsley extraction tree

        Arguments:
        parselet_node (dict) -- part of the Parsley tree to compile
                                (can be the root dict/node)
        level (int)          -- current recursion depth (used for debug)
        """

        if self.DEBUG:
            debug_offset = "".join(["    " for x in range(level)])

        if self.DEBUG:
            print(debug_offset, "%s::compile(%s)" % (
                self.__class__.__name__, parselet_node))

        if isinstance(parselet_node, dict):
            parselet_tree = ParsleyNode()
            for k, v in list(parselet_node.items()):

                # we parse the key raw elements but without much
                # interpretation (which is done by the SelectorHandler)
                try:
                    m = self.REGEX_PARSELET_KEY.match(k)
                    if not m:
                        if self.DEBUG:
                            print(debug_offset, "could not parse key", k)
                        raise InvalidKeySyntax(k)
                except:
                    raise InvalidKeySyntax("Key %s is not valid" % k)

                key = m.group('key')
                # by default, fields are required
                key_required = True
                operator = m.group('operator')
                if operator == '?':
                    key_required = False
                # FIXME: "!" operator not supported (complete array)
                scope = m.group('scope')

                # example: get list of H3 tags
                # { "titles": ["h3"] }
                # FIXME: should we support multiple selectors in list?
                #        e.g. { "titles": ["h1", "h2", "h3", "h4"] }
                if isinstance(v, (list, tuple)):
                    v = v[0]
                    iterate = True
                else:
                    iterate = False

                # keys in the abstract Parsley trees are of type `ParsleyContext`
                try:
                    parsley_context = ParsleyContext(
                        key,
                        operator=operator,
                        required=key_required,
                        scope=self.selector_handler.make(scope) if scope else None,
                        iterate=iterate)
                except SyntaxError:
                    if self.DEBUG:
                        print("Invalid scope:", k, scope)
                    raise

                if self.DEBUG:
                    print(debug_offset, "current context:", parsley_context)

                # go deeper in the Parsley tree...
                try:
                    child_tree = self._compile(v, level=level+1)
                except SyntaxError:
                    if self.DEBUG:
                        print("Invalid value: ", v)
                    raise
                except:
                    raise

                if self.DEBUG:
                    print(debug_offset, "child tree:", child_tree)

                parselet_tree[parsley_context] = child_tree

            return parselet_tree

        # a string leaf should match some kind of selector,
        # let the selector handler deal with it
        elif isstr(parselet_node):
            return self.selector_handler.make(parselet_node)
        else:
            raise ValueError(
                    "Unsupported type(%s) for Parselet node <%s>" % (
                        type(parselet_node), parselet_node))

    def extract(self, document, context=None):
        """
        Extract values as a dict object following the structure
        of the Parsley script (recursive)

        :param document: lxml-parsed document
        :param context: user-supplied context that will be passed to custom XPath extensions (as first argument)
        :rtype: Python *dict* object with mapped extracted content
        :raises: :class:`.NonMatchingNonOptionalKey`

        >>> import lxml.etree
        >>> import parslepy
        >>> html = '''
        ... <!DOCTYPE html>
        ... <html>
        ... <head>
        ...     <title>Sample document to test parslepy</title>
        ...     <meta http-equiv="content-type" content="text/html;charset=utf-8" />
        ... </head>
        ... <body>
        ... <h1 id="main">What&rsquo;s new</h1>
        ... <ul>
        ...     <li class="newsitem"><a href="/article-001.html">This is the first article</a></li>
        ...     <li class="newsitem"><a href="/article-002.html">A second report on something</a></li>
        ...     <li class="newsitem"><a href="/article-003.html">Python is great!</a> <span class="fresh">New!</span></li>
        ... </ul>
        ... </body>
        ... </html>
        ... '''
        >>> html_parser = lxml.etree.HTMLParser()
        >>> doc = lxml.etree.fromstring(html, parser=html_parser)
        >>> doc
        <Element html at 0x7f5fb1fce9b0>
        >>> rules = {
        ...     "headingcss": "#main",
        ...     "headingxpath": "//h1[@id='main']"
        ... }
        >>> p = parslepy.Parselet(rules)
        >>> p.extract(doc)
        {'headingcss': u'What\u2019s new', 'headingxpath': u'What\u2019s new'}

        """
        if context:
            self.selector_handler.context = context
        return self._extract(self.parselet_tree, document)

    def _extract(self, parselet_node, document, level=0):
        """
        Extract values at this document node level
        using the parselet_node instructions:
        - go deeper in tree
        - or call selector handler in case of a terminal selector leaf
        """

        if self.DEBUG:
            debug_offset = "".join(["    " for x in range(level)])

        # we must go deeper in the Parsley tree
        if isinstance(parselet_node, ParsleyNode):

            # default output
            output = {}

            # process all children
            for ctx, v in list(parselet_node.items()):
                if self.DEBUG:
                    print(debug_offset, "context:", ctx, v)
                extracted=None
                try:
                    # scoped-extraction:
                    # extraction should be done deeper in the document tree
                    if ctx.scope:
                        extracted = []
                        selected = self.selector_handler.select(document, ctx.scope)
                        if selected:
                            for i, elem in enumerate(selected, start=1):
                                parse_result = self._extract(v, elem, level=level+1)

                                if isinstance(parse_result, (list, tuple)):
                                    extracted.extend(parse_result)
                                else:
                                    extracted.append(parse_result)

                                # if we're not in an array,
                                # we only care about the first iteration
                                if not ctx.iterate:
                                    break

                            if self.DEBUG:
                                print(debug_offset,
                                    "parsed %d elements in scope (%s)" % (i, ctx.scope))

                    # local extraction
                    else:
                        extracted = self._extract(v, document, level=level+1)

                except NonMatchingNonOptionalKey as e:
                    if self.DEBUG:
                        print(debug_offset, str(e))
                    if not ctx.required or not self.STRICT_MODE:
                        output[ctx.key] = {}
                    else:
                        raise
                except Exception as e:
                    if self.DEBUG:
                        print(str(e))
                    raise

                # replace empty-list result when not looping by empty dict
                if (    isinstance(extracted, list)
                    and not extracted
                    and not ctx.iterate):
                        extracted = {}

                # keep only the first element if we're not in an array
                if self.KEEP_ONLY_FIRST_ELEMENT_IF_LIST:
                    try:
                        if (    isinstance(extracted, list)
                            and extracted
                            and not ctx.iterate):

                            if self.DEBUG:
                                print(debug_offset, "keep only 1st element")
                            extracted =  extracted[0]

                    except Exception as e:
                        if self.DEBUG:
                            print(str(e))
                            print(debug_offset, "error getting first element")

                # extraction for a required key gave nothing
                if (    self.STRICT_MODE
                    and ctx.required
                    and extracted is None):
                    raise NonMatchingNonOptionalKey(
                        'key "%s" is required but yield nothing\nCurrent path: %s/(%s)\n' % (
                            ctx.key,
                            document.getroottree().getpath(document),v
                            )
                        )

                # special key to extract a selector-defined level deeper
                # but still output at same level
                # this can be useful for breaking up long selectors
                # or when you need to mix XPath and CSS selectors
                # e.g.
                # {
                #   "something(#content div.main)": {
                #       "--(.//div[re:test(@class, 'style\d{3,6}')])": {
                #           "title": "h1",
                #           "subtitle": "h2"
                #       }
                #   }
                # }
                #
                if ctx.key == self.SPECIAL_LEVEL_KEY:
                    if isinstance(extracted, dict):
                        output.update(extracted)
                    elif isinstance(extracted, list):
                        if extracted:
                            raise RuntimeError(
                                "could not merge non-empty list at higher level")
                        else:
                            #empty list, dont bother?
                            pass
                else:
                    # required keys are handled above
                    if extracted is not None:
                        output[ctx.key] = extracted
                    else:
                        # do not add this optional key/value pair in the output
                        pass

            return output

        # a leaf/Selector node
        elif isinstance(parselet_node, Selector):
            return self.selector_handler.extract(document, parselet_node)

        else:
            # FIXME: can this happen?
            #        if selector handler returned None at compile time,
            #        probably yes
            pass

    def keys(self):
        """
        Return a list of 1st level keys of the output data model

        >>> import parslepy
        >>> rules = {
        ...     "headingcss": "#main",
        ...     "headingxpath": "//h1[@id='main']"
        ... }
        >>> p = parslepy.Parselet(rules)
        >>> sorted(p.keys())
        ['headingcss', 'headingxpath']

        """
        return self._keys(self.parselet_tree)

    def _keys(self, parselet_node):
        keys = []
        if isinstance(parselet_node, ParsleyNode):
            for ctx, v in list(parselet_node.items()):
                if ctx.key == self.SPECIAL_LEVEL_KEY:
                    keys.extend(self._keys(v))
                else:
                    keys.append(ctx.key)
        return keys