Exemplo n.º 1
0
 def __init__(self):
     """Set up the div-driven state machine and the result containers."""
     MyParser.__init__(self)

     # --- State machine -------------------------------------------------
     # Both lists are used as stacks; the last item is the active entry.
     self.state = ["none"]
     self.div_bookmark = [-1]
     # Mirrors the top of the state stack
     self.current_state = "none"
     self.div_level = 0
     # Each entry maps a <div> attribute to a state:
     # (attribute name, attribute property, state)
     self.div_state_map = [
         ('id', 'page-title', 'title'),
         ('id', 'breadcrumbs', 'breadcrumbs'),
         ('id', 'page-content', 'body'),
         ('id', 'toc-action-bar', 'useless'),
         ('id', 'toc', 'toc'),
         ('style', 'position:absolute', 'useless'),
     ]

     # --- Collected results ----------------------------------------------
     self.page_title = ""
     self.toc = ""
     self.links = OrderedSet()
     self.breadcrumbs = []
Exemplo n.º 2
0
Arquivo: parser.py Projeto: Juxi/aseba
 def __init__(self):
     """Prepare the parser: state stacks, div lookup table, result holders."""
     MyParser.__init__(self)

     # Results gathered while the page is parsed
     self.page_title = ""
     self.toc = ""
     self.links = OrderedSet()
     self.breadcrumbs = []

     # Lookup table for <div> attributes; each row reads
     # (attribute name, attribute property, state)
     self.div_state_map = [
         ('id', 'page-title', 'title'),
         ('id', 'breadcrumbs', 'breadcrumbs'),
         ('id', 'page-content', 'body'),
         ('id', 'toc-action-bar', 'useless'),
         ('id', 'toc', 'toc'),
         ('style', 'position:absolute', 'useless'),
     ]

     # State machine bookkeeping (the two lists are managed as stacks)
     self.div_level = 0
     self.div_bookmark = [-1]
     self.state = ["none"]
     # Always equal to the top of the state stack
     self.current_state = "none"
Exemplo n.º 3
0
class WikidotParser(MyParser):
    """WikidotParser is used to clean a page from www.wikidot.com,
    keeping only the interesting content.

    A stack-based state machine, driven by <div> tags, decides whether the
    parsed content goes to the output document, the table of contents, the
    page title or the breadcrumbs list."""

    def __init__(self):
        """Initialize internal variables"""
        MyParser.__init__(self)
        self.div_level = 0  # Number of <div> tags currently open
        self.div_bookmark = [-1]  # List managed as a stack
        self.state = ["none"]  # List managed as a stack
        self.current_state = "none"  # Point to the top of the stack
        # map for div tag attribute -> state
        # (attribute name, attribute property, state)
        # The property is matched as a substring and the first matching row
        # wins, so 'toc-action-bar' has to stay before 'toc'.
        self.div_state_map = [
            ('id', 'page-title', 'title'),
            ('id', 'breadcrumbs', 'breadcrumbs'),
            ('id', 'page-content', 'body'),
            ('id', 'toc-action-bar', 'useless'),
            ('id', 'toc', 'toc'),
            ('style', 'position:absolute', 'useless'),
        ]
        self.page_title = ""
        self.toc = ""
        self.links = OrderedSet()
        self.breadcrumbs = []
        # True once get_doc() has added the header, TOC table and footer
        self._doc_finalized = False

    # Public interface
    def get_doc(self):
        """Retrieve the parsed and cleaned document

        The first call wraps the TOC in its table and adds the header and
        the footer to the output document. Later calls return that same
        document instead of wrapping it a second time."""
        if self._doc_finalized:
            return self.out_doc

        # format the TOC
        if self.toc != "":
            self.toc = ("""<table id="toc-table" summary="TOC"><tr><td>"""
                        + self.toc + "</td></tr></table>")
        # Add header and footer
        header_template = Template(header)
        filled_header = header_template.substitute(title=self.page_title,
                                                   toc=self.toc)
        self.out_doc = filled_header + self.out_doc + footer
        self._doc_finalized = True
        return self.out_doc

    def get_links(self):
        """Retrieve the links embedded in the page (including images)"""
        return self.links

    def get_title(self):
        """Retrieve the page title collected while in the "title" state"""
        return self.page_title

    def get_breadcrumbs(self):
        """Retrieve the href values of the links found in the breadcrumbs"""
        return self.breadcrumbs

    # Inherited functions
    def handle_starttag(self, tag, attrs):
        """Overridden - Called when a start tag is parsed

        The heart of this function is the state machine.
        When a <div> tag is detected, the attributes are compared with
        a map of the form (name,value) -> state. If a match occurs,
        the state is pushed on top of the stack.

        Depending on the current state, the start tag is queued for output,
        or not."""
        # Debug
        if wikidot.debug.ENABLE_DEBUG:
            sys.stderr.write("<{}> {}\n".format(tag, attrs))

        # Update the state machine
        state_changed = self.__update_state_machine_start__(tag, attrs)

        if self.current_state == "body":
            if state_changed:
                # We have just entered the body, don't output this <div> tag
                return
            # Handle special tags (may rewrite attrs in place)
            self.__handle_body_tag__(tag, attrs)
            # Add the tag to output
            MyParser.handle_starttag(self, tag, attrs)
        elif self.current_state == "toc":
            # Handle the content of the TOC
            self.toc += MyParser.format_start_tag(self, tag, attrs)
        elif self.current_state == "breadcrumbs" and tag == 'a':
            # Register the breadcrumbs
            for name, value in attrs:
                if name == 'href':
                    self.breadcrumbs.append(value)
                    break

    def handle_endtag(self, tag):
        """Overridden - Called when an end tag is parsed

        The state machine is updated when a </div> tag is encountered.
        Depending on the current state, the end tag is queued for output,
        or not."""
        if self.current_state == "toc":
            # Add the tag to the TOC (done before the state update, so the
            # </div> closing the TOC is still part of it)
            self.toc += MyParser.format_end_tag(self, tag)

        # Update the state machine
        if self.__update_state_machine_end__(tag):
            # This </div> closed a tracked section: never output it
            return

        if self.current_state == "body":
            # Add the tag to output
            MyParser.handle_endtag(self, tag)

    def handle_data(self, data):
        """Overridden - Called when some data is parsed

        Depending on the current state, the data is queued for output,
        or not."""
        if self.current_state == "title":
            # Register the title
            self.page_title += data.strip()
        elif self.current_state == "body":
            # Add data to the output
            MyParser.handle_data(self, data)
        elif self.current_state == "toc":
            # Add data to the TOC
            self.toc += data

    def handle_charref(self, name):
        """Overridden - Called when a charref (&#xyz) is parsed

        Depending on the current state, the charref is queued for output,
        or not."""
        if self.current_state == "title":
            # Add charref to the title
            self.page_title += "&#" + name + ";"
        elif self.current_state == "body":
            # Add charref to the output
            MyParser.handle_charref(self, name)
        elif self.current_state == "toc":
            # Add charref to the TOC
            self.toc += "&#" + name + ";"

    def handle_entityref(self, name):
        """Overridden - Called when an entityref (&xyz) tag is parsed

        Depending on the current state, the entityref is queued for output,
        or not."""
        if self.current_state == "title":
            # Add the entityref to the title
            self.page_title += "&" + name + ";"
        elif self.current_state == "body":
            # Add the entityref to the output
            MyParser.handle_entityref(self, name)
        elif self.current_state == "toc":
            # Add the entityref to the TOC
            self.toc += "&" + name + ";"

    def handle_decl(self, decl):
        """Overridden - Called when a SGML declaration (<!) is parsed

        Depending on the current state, the declaration is queued for output,
        or not."""
        if self.current_state == "body":
            # Add the SGML declaration to the output
            MyParser.handle_decl(self, decl)

    # Private functions
    def _match_div_state(self, attrs):
        """Return the state mapped to a <div>'s attributes, or None.

        Attributes without a value are ignored. If several attributes
        match, the state of the last matching attribute is returned."""
        matched_state = None
        for name, value in attrs:
            if value is None:
                # Valueless attribute (e.g. <div id>): nothing to compare
                continue
            for map_name, map_value, state in self.div_state_map:
                if map_name == name and map_value in value:
                    matched_state = state
                    break
        return matched_state

    def __update_state_machine_start__(self, tag, attrs):
        """Update the state machine for a start tag.

        Returns True when the tag is a <div> that opened a new state."""
        state_changed = False

        if tag == 'div':
            if wikidot.debug.ENABLE_DEBUG:
                sys.stderr.write("{} {}\n".format(self.state,
                                                  self.div_bookmark))
            # Push at most one state per <div>: only one is popped when the
            # matching </div> is met, so pushing more would leave stale
            # states on the stack.
            new_state = self._match_div_state(attrs)
            if new_state is not None:
                self.state.append(new_state)
                self.div_bookmark.append(self.div_level)
                state_changed = True
            # Increment div level
            self.div_level += 1

        # Update the current state
        self.current_state = self.__get_current_state__()
        return state_changed

    def __update_state_machine_end__(self, tag):
        """Update the state machine for an end tag.

        Returns True when the tag is the </div> closing the current state."""
        state_changed = False

        if tag == 'div':
            if wikidot.debug.ENABLE_DEBUG:
                sys.stderr.write("{} {}\n".format(self.state,
                                                  self.div_bookmark))
            # A </div> without any open <div> is ignored: counting it would
            # make the level reach the -1 sentinel and pop the base state.
            if self.div_level > 0:
                self.div_level -= 1
                if self.div_level == self.div_bookmark[-1]:
                    # Matching closing </div> tag -> pop the state
                    self.state.pop()
                    self.div_bookmark.pop()
                    state_changed = True

        # Update the current state
        self.current_state = self.__get_current_state__()
        return state_changed

    def __get_current_state__(self):
        """Return the state on top of the stack"""
        return self.state[-1]

    def __handle_body_tag__(self, tag, attrs):
        """Register links and images found in the body.

        For <img> tags, attrs is modified in place to fix the width."""
        # Special case 1: links
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    # Register the link
                    self.links.add(value)
                    break
        # Special case 2: images
        elif tag == 'img':
            for index, (name, value) in enumerate(attrs):
                if name == 'src':
                    # Register the link
                    self.links.add(value)
                elif name == 'width' and value is not None:
                    # Fix the width=xx attribute
                    # Wikidot gives width="600px", instead of width=600
                    pos = value.find('px')
                    if pos >= 0:
                        attrs[index] = (name, value[0:pos])
Exemplo n.º 4
0
        print >>sys.stderr, "Creating output directory; ", outputdir
        os.mkdir(outputdir)

    # Fetch root page
    output = os.path.join(outputdir, urltoname(starturl))
    retval = fetchurl(starturl, output)
    newlinks = retval["links"]
    breadcrumbs = retval["breadcrumbs"]
    # get the last element of the list
    if len(breadcrumbs) > 0:
        breadcrumbs = breadcrumbs[len(breadcrumbs) - 1]
    else:
        breadcrumbs = ""

    # Create a set with fetched links (avoid loops...)
    links = OrderedSet(starturl)

    # Iterate on the links, and recursively download / convert
    fetchlinks = newlinks
    while len(fetchlinks) > 0:
        newlinks = OrderedSet()
        for url in fetchlinks:
            url = urlparse.urljoin(starturl, url)
            output = os.path.join(outputdir, urltoname(url))
            print >>sys.stderr, "\nProcessing ", url
            # Link on the same server?
            if urlparse.urlparse(url).netloc == urlparse.urlparse(starturl).netloc:
                retval = fetchurl(url, output, breadcrumbs)
                newlinks.update(retval["links"])
            else:
                print >>sys.stderr, "*** {} is not on the same server. Link skipped.".format(url)
Exemplo n.º 5
0
        print >> sys.stderr, "Creating output directory; ", outputdir
        os.mkdir(outputdir)

    # Fetch root page
    output = os.path.join(outputdir, urltoname(starturl))
    retval = fetchurl(starturl, output)
    newlinks = retval['links']
    breadcrumbs = retval['breadcrumbs']
    # get the last element of the list
    if len(breadcrumbs) > 0:
        breadcrumbs = breadcrumbs[len(breadcrumbs)-1]
    else:
        breadcrumbs = ''

    # Create a set with fetched links (avoid loops...)
    links = OrderedSet(starturl)

    # Iterate on the links, and recursively download / convert
    fetchlinks = newlinks
    while len(fetchlinks) > 0:
        newlinks = OrderedSet()
        for url in fetchlinks:
            url = urlparse.urljoin(starturl, url)
            output = os.path.join(outputdir, urltoname(url))
            print >> sys.stderr, "\nProcessing ", url
            # Link on the same server? If no match, search the list of alternative servers
            start_server = urlparse.urlparse(starturl).netloc
            link_server = urlparse.urlparse(url).netloc
            if (link_server == start_server or _get_alternate_server(link_server) == _get_alternate_server(start_server)):
                retval = fetchurl(url, output, breadcrumbs)
                newlinks.update(retval['links'])
Exemplo n.º 6
0
Arquivo: parser.py Projeto: Juxi/aseba
class WikidotParser(MyParser):
    """WikidotParser is used to clean a page from www.wikidot.com,
    keeping only the interesting content.

    A stack-based state machine, driven by <div> tags, decides whether the
    parsed content goes to the output document, the table of contents, the
    page title or the breadcrumbs list."""

    def __init__(self):
        """Initialize internal variables"""
        MyParser.__init__(self)
        self.div_level = 0  # Number of <div> tags currently open
        self.div_bookmark = [-1]  # List managed as a stack
        self.state = ["none"]  # List managed as a stack
        self.current_state = "none"  # Point to the top of the stack
        # map for div tag attribute -> state
        # (attribute name, attribute property, state)
        # The property is matched as a substring and the first matching row
        # wins, so 'toc-action-bar' has to stay before 'toc'.
        self.div_state_map = [
            ('id', 'page-title', 'title'),
            ('id', 'breadcrumbs', 'breadcrumbs'),
            ('id', 'page-content', 'body'),
            ('id', 'toc-action-bar', 'useless'),
            ('id', 'toc', 'toc'),
            ('style', 'position:absolute', 'useless'),
        ]
        self.page_title = ""
        self.toc = ""
        self.links = OrderedSet()
        self.breadcrumbs = []
        # True once get_doc() has added the header, TOC table and footer
        self._doc_finalized = False

    # Public interface
    def get_doc(self):
        """Retrieve the parsed and cleaned document

        The first call wraps the TOC in its table and adds the header and
        the footer to the output document. Later calls return that same
        document instead of wrapping it a second time."""
        if self._doc_finalized:
            return self.out_doc

        # format the TOC
        if self.toc != "":
            self.toc = ("""<table id="toc-table" summary="TOC"><tr><td>"""
                        + self.toc + "</td></tr></table>")
        # Add header and footer
        header_template = Template(header)
        filled_header = header_template.substitute(title=self.page_title,
                                                   toc=self.toc)
        self.out_doc = filled_header + self.out_doc + footer
        self._doc_finalized = True
        return self.out_doc

    def get_links(self):
        """Retrieve the links embedded in the page (including images)"""
        return self.links

    def get_title(self):
        """Retrieve the page title collected while in the "title" state"""
        return self.page_title

    def get_breadcrumbs(self):
        """Retrieve the href values of the links found in the breadcrumbs"""
        return self.breadcrumbs

    # Inherited functions
    def handle_starttag(self, tag, attrs):
        """Overridden - Called when a start tag is parsed

        The heart of this function is the state machine.
        When a <div> tag is detected, the attributes are compared with
        a map of the form (name,value) -> state. If a match occurs,
        the state is pushed on top of the stack.

        Depending on the current state, the start tag is queued for output,
        or not."""
        # Debug
        if wikidot.debug.ENABLE_DEBUG:
            sys.stderr.write("<{}> {}\n".format(tag, attrs))

        # Update the state machine
        state_changed = self.__update_state_machine_start__(tag, attrs)

        if self.current_state == "body":
            if state_changed:
                # We have just entered the body, don't output this <div> tag
                return
            # Handle special tags (may rewrite attrs in place)
            self.__handle_body_tag__(tag, attrs)
            # Add the tag to output
            MyParser.handle_starttag(self, tag, attrs)
        elif self.current_state == "toc":
            # Handle the content of the TOC
            self.toc += MyParser.format_start_tag(self, tag, attrs)
        elif self.current_state == "breadcrumbs" and tag == 'a':
            # Register the breadcrumbs
            for name, value in attrs:
                if name == 'href':
                    self.breadcrumbs.append(value)
                    break

    def handle_endtag(self, tag):
        """Overridden - Called when an end tag is parsed

        The state machine is updated when a </div> tag is encountered.
        Depending on the current state, the end tag is queued for output,
        or not."""
        if self.current_state == "toc":
            # Add the tag to the TOC (done before the state update, so the
            # </div> closing the TOC is still part of it)
            self.toc += MyParser.format_end_tag(self, tag)

        # Update the state machine
        if self.__update_state_machine_end__(tag):
            # This </div> closed a tracked section: never output it
            return

        if self.current_state == "body":
            # Add the tag to output
            MyParser.handle_endtag(self, tag)

    def handle_data(self, data):
        """Overridden - Called when some data is parsed

        Depending on the current state, the data is queued for output,
        or not."""
        if self.current_state == "title":
            # Register the title
            self.page_title += data.strip()
        elif self.current_state == "body":
            # Add data to the output
            MyParser.handle_data(self, data)
        elif self.current_state == "toc":
            # Add data to the TOC
            self.toc += data

    def handle_charref(self, name):
        """Overridden - Called when a charref (&#xyz) is parsed

        Depending on the current state, the charref is queued for output,
        or not."""
        if self.current_state == "title":
            # Add charref to the title
            self.page_title += "&#" + name + ";"
        elif self.current_state == "body":
            # Add charref to the output
            MyParser.handle_charref(self, name)
        elif self.current_state == "toc":
            # Add charref to the TOC
            self.toc += "&#" + name + ";"

    def handle_entityref(self, name):
        """Overridden - Called when an entityref (&xyz) tag is parsed

        Depending on the current state, the entityref is queued for output,
        or not."""
        if self.current_state == "title":
            # Add the entityref to the title
            self.page_title += "&" + name + ";"
        elif self.current_state == "body":
            # Add the entityref to the output
            MyParser.handle_entityref(self, name)
        elif self.current_state == "toc":
            # Add the entityref to the TOC
            self.toc += "&" + name + ";"

    def handle_decl(self, decl):
        """Overridden - Called when a SGML declaration (<!) is parsed

        Depending on the current state, the declaration is queued for output,
        or not."""
        if self.current_state == "body":
            # Add the SGML declaration to the output
            MyParser.handle_decl(self, decl)

    # Private functions
    def _match_div_state(self, attrs):
        """Return the state mapped to a <div>'s attributes, or None.

        Attributes without a value are ignored. If several attributes
        match, the state of the last matching attribute is returned."""
        matched_state = None
        for name, value in attrs:
            if value is None:
                # Valueless attribute (e.g. <div id>): nothing to compare
                continue
            for map_name, map_value, state in self.div_state_map:
                if map_name == name and map_value in value:
                    matched_state = state
                    break
        return matched_state

    def __update_state_machine_start__(self, tag, attrs):
        """Update the state machine for a start tag.

        Returns True when the tag is a <div> that opened a new state."""
        state_changed = False

        if tag == 'div':
            if wikidot.debug.ENABLE_DEBUG:
                sys.stderr.write("{} {}\n".format(self.state,
                                                  self.div_bookmark))
            # Push at most one state per <div>: only one is popped when the
            # matching </div> is met, so pushing more would leave stale
            # states on the stack.
            new_state = self._match_div_state(attrs)
            if new_state is not None:
                self.state.append(new_state)
                self.div_bookmark.append(self.div_level)
                state_changed = True
            # Increment div level
            self.div_level += 1

        # Update the current state
        self.current_state = self.__get_current_state__()
        return state_changed

    def __update_state_machine_end__(self, tag):
        """Update the state machine for an end tag.

        Returns True when the tag is the </div> closing the current state."""
        state_changed = False

        if tag == 'div':
            if wikidot.debug.ENABLE_DEBUG:
                sys.stderr.write("{} {}\n".format(self.state,
                                                  self.div_bookmark))
            # A </div> without any open <div> is ignored: counting it would
            # make the level reach the -1 sentinel and pop the base state.
            if self.div_level > 0:
                self.div_level -= 1
                if self.div_level == self.div_bookmark[-1]:
                    # Matching closing </div> tag -> pop the state
                    self.state.pop()
                    self.div_bookmark.pop()
                    state_changed = True

        # Update the current state
        self.current_state = self.__get_current_state__()
        return state_changed

    def __get_current_state__(self):
        """Return the state on top of the stack"""
        return self.state[-1]

    def __handle_body_tag__(self, tag, attrs):
        """Register links and images found in the body.

        For <img> tags, attrs is modified in place to fix the width."""
        # Special case 1: links
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    # Register the link
                    self.links.add(value)
                    break
        # Special case 2: images
        elif tag == 'img':
            for index, (name, value) in enumerate(attrs):
                if name == 'src':
                    # Register the link
                    self.links.add(value)
                elif name == 'width' and value is not None:
                    # Fix the width=xx attribute
                    # Wikidot gives width="600px", instead of width=600
                    pos = value.find('px')
                    if pos >= 0:
                        attrs[index] = (name, value[0:pos])