Exemplo n.º 1
0
 def __init__(self, url, verbose=0):
     """Create the parser and open a PDF with a title page for ``url``.

     The PDF file is named after the percent-decoded last path segment of
     ``url`` with ".pdf" appended. ``verbose`` is accepted but not used:
     it is not forwarded to ``HTMLParser.__init__``.
     """
     HTMLParser.__init__(self)
     self.hyperlinks = []
     self.url = url
     self.language = detect_language(url)

     # Percent-decoded last path segment: used as file stem and as title.
     page_title = urllib.unquote(self.url.split("/")[-1])
     self.pdf = PDFWriter(page_title + ".pdf", StandardPaper.A4)

     decoded_url = urllib.unquote(self.url)
     page_header = Header(text_align=pango.ALIGN_CENTER)
     # TODO Alignment not working.
     page_header.set_text(decoded_url)
     self.pdf.set_header(page_header)

     # Title page: two blue headings (page title, then the full URL).
     self.pdf.move_context(0, 500)
     for caption, size in ((page_title, 32), (decoded_url, 16)):
         heading = Text(caption, font="serif", font_size=size)
         heading.color = StandardColors.Blue
         self.pdf.add_text(heading)

     page_footer = Footer(text_align=pango.ALIGN_CENTER)
     page_footer.set_text("wiki2pdf")
     self.pdf.set_footer(page_footer)
     self.pdf.page_break()
Exemplo n.º 2
0
 def __init__(self, url, filename=None, path=None, verbose=0):
     """Create the parser and open a PDF with a title page for ``url``.

     ``filename`` defaults to the percent-decoded last path segment of
     ``url`` plus ".pdf"; ``path`` defaults to the current working
     directory. ``verbose`` is accepted but not used.
     """
     HTMLParser.__init__(self)
     self.hyperlinks = []
     self.url = url
     self.language = detect_language(url)

     # Percent-decoded forms of the page name and of the whole URL.
     page_title = urllib.unquote(self.url.split("/")[-1])
     decoded_url = urllib.unquote(self.url)

     target_name = page_title + ".pdf" if filename is None else filename
     target_dir = os.getcwd() if path is None else path
     self.pdf = PDFWriter(os.path.join(target_dir, target_name), StandardPaper.A4)

     page_header = Header(text_align=pango.ALIGN_CENTER)
     # TODO Alignment not working.
     page_header.set_text(decoded_url)
     self.pdf.set_header(page_header)

     # Title page: two blue headings (page title, then the full URL).
     self.pdf.move_context(0, 500)
     for caption, size in ((page_title, 32), (decoded_url, 16)):
         heading = Text(caption, font="serif", font_size=size)
         heading.color = StandardColors.Blue
         self.pdf.add_text(heading)

     page_footer = Footer(text_align=pango.ALIGN_CENTER)
     page_footer.set_text("wiki2pdf")
     self.pdf.set_footer(page_footer)
     self.pdf.page_break()
Exemplo n.º 3
0
 def __init__(self, inputfile, filename, font="serif", verbose=0):
     """Create the parser and open a PDF whose title page uses the first URL.

     ``inputfile`` is resolved relative to this module's directory and its
     first line is read as the URL shown on the title page. The PDF is
     written as ``filename`` inside the "tmp" folder next to this module.
     ``font`` is used for the title-page headings and stored as
     ``self.font``. ``verbose`` is accepted but not used.
     """
     HTMLParser.__init__(self)
     self.hyperlinks = []
     module_dir = os.path.dirname(__file__)
     self.inputfile = os.path.join(module_dir, inputfile)

     # The with-block closes the file even if reading fails. The line
     # terminator is stripped so it does not end up in the header/title.
     with open(self.inputfile) as url_list:
         url = url_list.readline().strip()

     self.font = font
     self.language = detect_language(url)
     tmp_folder = os.path.join(module_dir, "tmp")
     self.pdf = PDFWriter(os.path.join(tmp_folder, filename), StandardPaper.A4)

     header = Header(text_align=pango.ALIGN_CENTER)
     # TODO Alignment not working.
     header.set_text(urllib.unquote(url))
     self.pdf.set_header(header)

     # Title page: page name and full URL as blue headings.
     self.pdf.move_context(0, 500)
     h1 = Text(urllib.unquote(url.split("/")[-1]), font=self.font, font_size=32)
     h1.color = StandardColors.Blue
     self.pdf.add_text(h1)
     h2 = Text(urllib.unquote(url), font=self.font, font_size=16)
     h2.color = StandardColors.Blue
     self.pdf.add_text(h2)

     footer = Footer(text_align=pango.ALIGN_CENTER)
     footer.set_text("wiki2book")
     self.pdf.set_footer(footer)
     self.pdf.page_break()
Exemplo n.º 4
0
 def parse(self, filename, outputfile):
     """Convert the text file ``filename`` into the PDF ``outputfile``.

     The file is read as UTF-8, passed through ``publish_parts`` and the
     resulting 'html_body' is fed to this parser, whose handlers write to
     a new ``PDFWriter``. Returns '' when ``filename`` cannot be read
     (IOError); otherwise returns None after the PDF has been flushed.
     """
     try:
         # The with-block closes the handle instead of leaking it.
         with codecs.open(filename, 'r', 'utf-8') as source:
             text = source.read()
     except IOError:  # given filename could not be found
         return ''
     parts = publish_parts(text, writer=Writer(), settings_overrides=SETTINGS)
     # Fall back to an empty body so a missing 'html_body' no longer
     # raises a NameError further down.
     html = parts.get('html_body', '')
     self.pdf = PDFWriter(outputfile, StandardPaper.A4)
     footer = Footer()
     header = Header()
     self.pdf.set_footer(footer)
     self.pdf.set_header(header)
     self.feed(html)
     self.close()
     self.pdf.flush()
Exemplo n.º 5
0
 def __init__(self, url, verbose=0):
     """Initialise the parser and open a PDF with a title page for ``url``.

     ``verbose`` is passed to ``SGMLParser.__init__``. The PDF is named
     after the last path segment of ``url`` with ".pdf" appended.
     """
     SGMLParser.__init__(self, verbose)
     self.hyperlinks = []
     self.url = url
     page_name = self.url.split("/")[-1]
     # 595 x 842 — presumably A4 in points; confirm against PDFWriter.
     self.pdf = PDFWriter(page_name + ".pdf", 595, 842)
     header = Header(text_align=pango.ALIGN_CENTER)
     # TODO Alignment not working.
     header.set_text(self.url)
     self.pdf.set_header(header)
     h1 = Text(page_name, font="Dyuthi", font_size=32)
     self.pdf.add_h1(h1)
     # NOTE(review): the family was spelled "Rachan", which looks like a
     # typo for the "Rachana" font — confirm the font is installed.
     h2 = Text(self.url, font="Rachana", font_size=16)
     self.pdf.add_h2(h2)
     footer = Footer(text_align=pango.ALIGN_CENTER)
     footer.set_text("wiki2pdf")
     self.pdf.set_footer(footer)
     self.pdf.page_break()
Exemplo n.º 6
0
 def __init__(self, url, verbose=0):
     """Initialise the parser and open a PDF with a title page for ``url``.

     ``verbose`` is passed to ``SGMLParser.__init__``. The PDF is named
     after the last path segment of ``url`` with ".pdf" appended.
     """
     SGMLParser.__init__(self, verbose)
     self.hyperlinks = []
     self.url = url
     self.language = detect_language(url)

     page_name = self.url.split("/")[-1]
     self.pdf = PDFWriter(page_name + ".pdf", StandardPaper.A4)

     page_header = Header(text_align=pango.ALIGN_CENTER)
     # TODO Alignment not working.
     page_header.set_text(self.url)
     self.pdf.set_header(page_header)

     # Title page: page name, then the full URL.
     self.pdf.move_context(0, 500)
     for caption, size in ((page_name, 32), (self.url, 16)):
         self.pdf.add_text(Text(caption, font="Serif", font_size=size))

     page_footer = Footer(text_align=pango.ALIGN_CENTER)
     page_footer.set_text("wiki2pdf")
     self.pdf.set_footer(page_footer)
     self.pdf.page_break()
Exemplo n.º 7
0
 def reset(self):
     """Reset parser state and start a fresh "output.pdf" document.

     Besides the superclass reset, this creates a new ``PDFWriter`` with
     a default header and footer and clears every element flag and the
     text buffer.
     """
     SGMLParser.reset(self)
     self.images = []
     # TODO make the output file configurable- take it from command line
     self.pdf = PDFWriter("output.pdf", 595, 842)

     # TODO Alignment not working.
     page_header = Header(text_align=pango.ALIGN_CENTER)
     page_header.set_text("A wikipedia article")
     self.pdf.set_header(page_header)

     # TODO Alignment not working.
     page_footer = Footer(text_align=pango.ALIGN_CENTER)
     page_footer.set_text("wiki2pdf")
     self.pdf.set_footer(page_footer)

     # After a reset no element is open and nothing is being buffered.
     for flag in ("h1", "h2", "li", "p", "a", "ul", "ol", "span"):
         setattr(self, flag, False)
     self.buffer = None
Exemplo n.º 8
0
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with pypdflib.  If not, see <http://www.gnu.org/licenses/>.

import sys

sys.path.append("../src/")  # not good!
from pypdflib.writer import PDFWriter
from pypdflib.widgets import *
from pypdflib.styles import *
import pango

if __name__ == "__main__":
    # Sample script: builds "scripts.pdf" with a header, a footer and two
    # headings. The visible part of the script stops after opening the
    # sample text file.
    pdf = PDFWriter("scripts.pdf", StandardPaper.A4)
    # Header text is a non-Latin (Malayalam-script) string, left-aligned.
    header = Header(text_align=pango.ALIGN_LEFT)
    header.set_text("ടെസ്റ്റ് തലക്കെട്ട്")
    pdf.set_header(header)
    footer = Footer(text_align=pango.ALIGN_LEFT)
    footer.set_text("test footer")
    pdf.set_footer(footer)
    # Document title followed by a blue sub-heading.
    h1 = Text("Samples", font_size=16)
    pdf.add_text(h1)
    h2 = Text("Malayalam", font_size=14)
    h2.color = StandardColors.Blue
    pdf.add_text(h2)

    # Opened relative to the current working directory.
    para_file_malayalam = open("malayalam.txt")
    # image = Image(image_file="Four_Sons_of_Dasaratha.png")
    # pdf.add_image(image)
Exemplo n.º 9
0
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with pypdflib.  If not, see <http://www.gnu.org/licenses/>.

import sys
sys.path.append("../src/")  #not good!
from pypdflib.writer import PDFWriter
from pypdflib.widgets import *
from pypdflib.styles import *
import pango

if __name__=="__main__":
    # Sample script: builds "tables.pdf" with a header, a footer and the
    # start of a one-row table. The visible part stops inside the loop.
    pdf = PDFWriter("tables.pdf",StandardPaper.A4)
    header = Header(text_align = pango.ALIGN_CENTER)
    #TODO Alignment not working.
    header.set_text("test header")
    pdf.set_header(header)
    footer = Footer(text_align = pango.ALIGN_CENTER)
    footer.set_text("test footer")
    #TODO Alignment not working.
    pdf.set_footer(footer)
    table = Table(border_width=1)
    # presumably one padding value per cell side — confirm the order
    # against the Table widget
    table.cell_padding = [2, 2, 2, 2]
    row = Row(height=100)
    # Build four cells labelled "SampleCell 0" .. "SampleCell 3".
    for i in range(4):
        cell_content = Text("SampleCell "+str(i),font_size=14)
        # presumably RGBA, i.e. opaque black — confirm against Color
        cell_content.color = Color(0.0,0.0,0.0,1.0)
        cell = Cell(cell_content, font_size=8,width=100)
Exemplo n.º 10
0
class Wikiparser(SGMLParser):
    """SGML parser that renders a downloaded wiki page into a PDF.

    Text is collected in ``self.buffer`` while an element is open and is
    handed to ``self.pdf`` when the element closes. The boolean
    attributes (``h1``, ``p``, ``li``, ...) record which elements are
    currently open.
    """

    def __init__(self, url, verbose=0):
        """Initialise the parser and open a PDF with a title page for ``url``.

        ``verbose`` is passed to ``SGMLParser.__init__``. The PDF is named
        after the last path segment of ``url`` with ".pdf" appended.
        """
        SGMLParser.__init__(self, verbose)
        self.hyperlinks = []
        self.url = url
        page_name = self.url.split("/")[-1]
        self.pdf = PDFWriter(page_name + ".pdf", 595, 842)
        header = Header(text_align=pango.ALIGN_CENTER)
        # TODO Alignment not working.
        header.set_text(self.url)
        self.pdf.set_header(header)
        h1 = Text(page_name, font="Dyuthi", font_size=32)
        self.pdf.add_h1(h1)
        # "Rachana" matches the family used by end_h2/end_p below; the
        # earlier "Rachan" spelling looks like a typo.
        h2 = Text(self.url, font="Rachana", font_size=16)
        self.pdf.add_h2(h2)
        footer = Footer(text_align=pango.ALIGN_CENTER)
        footer.set_text("wiki2pdf")
        self.pdf.set_footer(footer)
        self.pdf.page_break()

    def reset(self):
        """Reset the superclass state, the element flags and the buffer."""
        SGMLParser.reset(self)
        self.images = []
        self.h1 = False
        self.h2 = False
        self.li = False
        self.p = False
        self.a = False
        self.ul = False
        self.ol = False
        self.span = False
        self.buffer = None

    def handle_data(self, data):
        """Append non-blank text to the buffer while a tracked element is open.

        List items are included so that plain ``<li>`` text (not wrapped
        in a link or span) reaches ``end_li`` instead of being dropped.
        """
        if not data.strip():
            return
        inside_tracked = (self.p or self.h1 or self.h2 or self.a
                          or self.span or self.li)
        if inside_tracked and self.buffer is not None:
            self.buffer += data

    def start_img(self, attrs):
        """Remember the ``src`` of every image encountered."""
        src = [value for key, value in attrs if key == 'src']
        if src:
            self.images.extend(src)

    def start_h1(self, attrs):
        """Start buffering the text of a level-1 heading."""
        self.h1 = True
        self.buffer = ""

    def end_h1(self):
        """Emit the buffered text as a level-1 heading."""
        self.h1 = False
        h1 = Text(self.buffer, font="Dyuthi", font_size=16)
        self.pdf.add_h1(h1)
        self.buffer = None

    def start_h2(self, attrs):
        """Start buffering the text of a level-2 heading."""
        self.h2 = True
        self.buffer = ""

    def end_h2(self):
        """Emit the buffered text as a level-2 heading unless it is blank."""
        self.h2 = False
        if self.buffer and self.buffer.strip():
            h2 = Text(self.buffer, font="Rachana", font_size=14)
            self.pdf.add_h2(h2)
        self.buffer = None

    def start_li(self, attrs):
        """Start buffering the text of a list item."""
        self.li = True
        self.buffer = ""

    def end_li(self):
        """Emit the buffered list item; items inside ``<ul>`` get a bullet."""
        self.li = False
        if self.buffer and self.buffer.strip():
            if self.ul:
                li = Text("• " + self.buffer, font_size=10)
            else:
                li = Text(self.buffer, font_size=10)
            self.pdf.add_li(li)
        self.buffer = None

    def start_a(self, attrs):
        """Mark that a link is open."""
        self.a = True

    def end_a(self):
        """Mark that the link is closed."""
        self.a = False

    def start_ol(self, attrs):
        """Mark that an ordered list is open."""
        self.ol = True

    def end_ol(self):
        """Mark that the ordered list is closed."""
        self.ol = False

    def start_ul(self, attrs):
        """Mark that an unordered list is open."""
        self.ul = True

    def end_ul(self):
        """Mark that the unordered list is closed."""
        self.ul = False

    def start_span(self, attrs):
        """Mark that a span is open, creating a buffer if none exists."""
        self.span = True
        if self.buffer is None:
            self.buffer = ""

    def end_span(self):
        """Close the span, separating its text from what follows by a space."""
        # The enclosing element may already have flushed and cleared the
        # buffer (e.g. a <p> closed inside the span); appending to None
        # would raise TypeError.
        if self.buffer is not None:
            self.buffer += " "
        self.span = False

    def start_p(self, attrs):
        """Start buffering the text of a paragraph."""
        self.p = True
        self.buffer = ""

    def end_p(self):
        """Emit the buffered text as a justified, hyphenated paragraph."""
        self.p = False
        para = Paragraph(text=self.buffer, font="Rachana", font_size=10,)
        para.set_justify(True)
        para.language = "ml_IN"
        para.set_hyphenate(True)
        self.pdf.add_paragraph(para)
        self.buffer = None

    def set_header(self, text):
        """Store ``text`` as ``self.header``."""
        self.header = text

    def parse(self):
        """Download ``self.url``, clean it with ``cleanup`` and parse it."""
        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        infile = opener.open(self.url)
        try:
            page = infile.read()
        finally:
            # Release the connection even if reading fails.
            infile.close()
        page = cleanup(page)
        self.feed(page)
        self.close()
Exemplo n.º 11
0
class Wikiparser2(HTMLParser):
    """HTML parser that renders the pages listed in a URL file into one PDF.

    Text is collected in ``self.buffer`` while an element is open and is
    handed to ``self.pdf`` when the element closes. The boolean
    attributes (``h1``, ``p``, ``li``, ...) record which elements are
    currently open.
    """

    # Tags that have a dedicated start_<tag>/end_<tag> method below.
    _STRUCTURAL_TAGS = ("img", "h1", "h2", "li", "p", "a", "ul", "ol", "span")
    # Inline formatting tags that are copied into the buffer as markup.
    _INLINE_TAGS = ("sup", "sub", "b", "i", "s", "small", "big", "tt", "u")

    def __init__(self, inputfile, filename, font="serif", verbose=0):
        """Create the parser and open a PDF whose title page uses the first URL.

        ``inputfile`` is resolved relative to this module's directory and
        its first line is read as the URL shown on the title page. The PDF
        is written as ``filename`` inside the "tmp" folder next to this
        module. ``font`` is stored as ``self.font`` and used for all text.
        ``verbose`` is accepted but not used.
        """
        HTMLParser.__init__(self)
        self.hyperlinks = []
        module_dir = os.path.dirname(__file__)
        self.inputfile = os.path.join(module_dir, inputfile)
        # The with-block closes the file even if reading fails. The line
        # terminator is stripped so it does not end up in the header/title.
        with open(self.inputfile) as url_list:
            url = url_list.readline().strip()
        self.font = font
        self.language = detect_language(url)
        tmp_folder = os.path.join(module_dir, "tmp")
        self.pdf = PDFWriter(os.path.join(tmp_folder, filename), StandardPaper.A4)
        header = Header(text_align=pango.ALIGN_CENTER)
        # TODO Alignment not working.
        header.set_text(urllib.unquote(url))
        self.pdf.set_header(header)
        self.pdf.move_context(0, 500)
        h1 = Text(urllib.unquote(url.split("/")[-1]), font=self.font, font_size=32)
        h1.color = StandardColors.Blue
        self.pdf.add_text(h1)
        h2 = Text(urllib.unquote(url), font=self.font, font_size=16)
        h2.color = StandardColors.Blue
        self.pdf.add_text(h2)
        footer = Footer(text_align=pango.ALIGN_CENTER)
        footer.set_text("wiki2book")
        self.pdf.set_footer(footer)
        self.pdf.page_break()

    def reset(self):
        """Reset the superclass state, the element flags and the buffer."""
        HTMLParser.reset(self)
        self.images = []
        self.h1 = False
        self.h2 = False
        self.li = False
        self.p = False
        self.a = False
        self.ul = False
        self.ol = False
        self.span = False
        self.reference = False
        self.ref_counter = 0
        self.buffer = None
        self.sup = False

    def handle_data(self, data):
        """Append non-blank text to the buffer while a tracked element is open."""
        if not data.strip():
            return
        inside_tracked = (self.p or self.h1 or self.h2 or self.a
                          or self.span or self.li)
        if inside_tracked and self.buffer is not None:
            self.buffer += data

    def handle_starttag(self, tag, attrs):
        """Dispatch a start tag to ``start_<tag>`` or record inline markup."""
        if tag in self._STRUCTURAL_TAGS:
            getattr(self, "start_" + tag)(attrs)
        elif tag in self._INLINE_TAGS:
            # Inline markup is kept only outside the references list and
            # only while something is being buffered.
            if not self.reference and self.buffer is not None:
                self.buffer += "<" + tag + ">"
                self.sup = True

    def handle_endtag(self, tag):
        """Dispatch an end tag to ``end_<tag>`` or close inline markup."""
        if tag in self._STRUCTURAL_TAGS:
            getattr(self, "end_" + tag)()
        elif tag in self._INLINE_TAGS:
            if self.sup and self.buffer is not None:
                self.buffer += "</" + tag + ">"

    def start_img(self, attrs):
        """Remember the ``src`` of every image encountered."""
        src = [value for key, value in attrs if key == "src"]
        if src:
            self.images.extend(src)

    def end_img(self):
        """Download every pending image and add it to the PDF."""
        for wiki_image in self.images:
            outpath = self.grab_image(wiki_image, "/tmp")
            if outpath is None:
                # Download failed: there is no file to embed.
                continue
            image = Image()
            image.set_image_file(outpath)
            self.pdf.add_image(image)
        self.images = []

    def start_h1(self, attrs):
        """Start buffering the text of a level-1 heading."""
        self.h1 = True
        self.buffer = ""

    def end_h1(self):
        """Emit the buffered text as a blue level-1 heading."""
        self.h1 = False
        h1 = Text(self.buffer, font=self.font, font_size=16)
        h1.color = StandardColors.Blue
        self.pdf.add_text(h1)
        self.buffer = None

    def start_h2(self, attrs):
        """Start buffering the text of a level-2 heading."""
        self.h2 = True
        self.buffer = ""

    def end_h2(self):
        """Emit the buffered text as a blue level-2 heading unless blank."""
        self.h2 = False
        if self.buffer and self.buffer.strip():
            h2 = Text(self.buffer, font=self.font, font_size=14)
            h2.color = StandardColors.Blue
            self.pdf.add_text(h2)
        self.buffer = None

    def start_li(self, attrs):
        """Start buffering the text of a list item."""
        self.li = True
        self.buffer = ""

    def end_li(self):
        """Emit the buffered list item.

        Items inside ``<ul>`` get a bullet. Items inside ``<ol>`` get a
        running number and have the "↑" character removed.
        """
        self.li = False
        if self.buffer and self.buffer.strip():
            if self.ul:
                li = Text(markup="• " + self.buffer, font=self.font, font_size=10)
            elif self.ol:
                self.ref_counter += 1
                li = Text(
                    markup=str(self.ref_counter) + ". " + self.buffer.replace("↑", ""), font=self.font, font_size=10
                )
            else:
                li = Text(markup=self.buffer, font=self.font, font_size=10)
            self.pdf.add_text(li)
        self.buffer = None

    def start_a(self, attrs):
        """Mark that a link is open."""
        self.a = True

    def end_a(self):
        """Mark that the link is closed."""
        self.a = False

    def start_ol(self, attrs):
        """Mark that an ordered list is open; detect the references list."""
        self.ol = True
        for tups in attrs:
            if "class" in tups:
                if tups[1] == "references":
                    self.reference = True

    def end_ol(self):
        """Close the ordered list and restart its numbering."""
        self.ol = False
        self.ref_counter = 0
        if self.reference:
            self.reference = False

    def start_ul(self, attrs):
        """Mark that an unordered list is open."""
        self.ul = True

    def end_ul(self):
        """Mark that the unordered list is closed."""
        self.ul = False

    def start_span(self, attrs):
        """Mark that a span is open, creating a buffer if none exists."""
        self.span = True
        if self.buffer is None:
            self.buffer = ""

    def end_span(self):
        """Mark that the span is closed.

        The buffer is left untouched. It may already be None when an
        enclosing element was flushed inside the span, which previously
        raised TypeError here.
        """
        self.span = False

    def start_p(self, attrs):
        """Start buffering the text of a paragraph."""
        self.p = True
        self.buffer = ""

    def end_p(self):
        """Emit the buffered text as a justified, hyphenated paragraph.

        When inline markup was recorded, the buffer is also passed as
        ``markup`` to ``Paragraph``.
        """
        self.p = False
        if self.sup:
            para = Paragraph(markup=self.buffer, text=self.buffer, font=self.font, font_size=10)
            self.sup = False
        else:
            para = Paragraph(text=self.buffer, font=self.font, font_size=10)

        para.set_justify(True)
        para.language = self.language if self.language else None

        para.set_hyphenate(True)
        self.pdf.add_paragraph(para)
        self.buffer = None

    def set_header(self, text):
        """Store ``text`` as ``self.header``."""
        self.header = text

    def grab_image(self, imageurl, outputfolder):
        """Download ``imageurl`` into ``outputfolder`` and return the local path.

        A file that already exists at the target path is reused without
        downloading. Returns None when the download fails with an HTTP
        error, so callers never receive a path that was not written.
        """
        link = imageurl.strip()
        if not link.startswith(("http://", "https://")):
            # Sources without a scheme (e.g. "//host/path") get "https:"
            # prepended; absolute URLs are used unchanged.
            link = "https:" + link
        output_filename = os.path.join(outputfolder, link.split("/")[-1])
        print("GET IMAGE " + link + " ==> " + output_filename)
        if os.path.isfile(output_filename):
            print("File " + output_filename + " already exists")
            return output_filename
        try:
            opener = urllib2.build_opener()
            opener.addheaders = [("User-agent", "Mozilla/5.0")]
            infile = opener.open(link)
            try:
                payload = infile.read()
            finally:
                infile.close()
            # Binary mode: image bytes must not go through newline
            # translation or text encoding.
            with open(output_filename, "wb") as image_file:
                image_file.write(payload)
        except KeyboardInterrupt:
            sys.exit()
        except urllib2.HTTPError:
            print("Error: Cound not download the image")
            return None
        return output_filename

    def parse(self):
        """Fetch every URL listed in ``self.inputfile`` and render it.

        Each non-blank line is treated as one URL. Its page is downloaded,
        cleaned with ``cleanup`` and fed to the parser, followed by a page
        break. The PDF is flushed once all URLs have been processed.
        """
        opener = urllib2.build_opener()
        opener.addheaders = [("User-agent", "Mozilla/5.0")]
        with open(self.inputfile) as url_list:
            for line in url_list:
                url = line.strip()
                if not url:
                    # Blank lines are not URLs; skip them.
                    continue
                infile = opener.open(url)
                try:
                    page = infile.read()
                finally:
                    infile.close()
                page = cleanup(page)
                self.feed(page)
                self.pdf.page_break()
        self.close()
        self.pdf.flush()
Exemplo n.º 12
0
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with pypdflib.  If not, see <http://www.gnu.org/licenses/>.

import sys
sys.path.append("../src/")  #not good!
from pypdflib.writer import PDFWriter
from pypdflib.widgets import *
from pypdflib.styles import *
import pango

if __name__=="__main__":
    # Sample script: builds "scripts.pdf" with a header, a footer and two
    # headings. The visible part of the script stops after opening the
    # sample text file.
    pdf = PDFWriter("scripts.pdf",StandardPaper.A4)
    header = Header(text_align = pango.ALIGN_CENTER)
    #TODO Alignment not working.
    header.set_text("test header")
    pdf.set_header(header)
    footer = Footer(text_align = pango.ALIGN_CENTER)
    footer.set_text("test footer")
    #TODO Alignment not working.
    pdf.set_footer(footer)
    # Document title followed by a blue sub-heading.
    h1= Text("Samples",font_size=16) 
    pdf.add_text(h1)
    h2= Text("Malayalam",font_size=14) 
    h2.color = StandardColors.Blue
    pdf.add_text(h2)
    
    # Opened relative to the current working directory.
    para_file_malayalam=open("malayalam.txt")
Exemplo n.º 13
0
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with pypdflib.  If not, see <http://www.gnu.org/licenses/>.

import sys
sys.path.append("../src/")  #not good!
from pypdflib.writer import PDFWriter
from pypdflib.widgets import *
from pypdflib.styles import *
import pango

if __name__=="__main__":
    # Sample script: builds "image.pdf" with a header, a footer and one
    # image, then flushes the document.
    pdf = PDFWriter("image.pdf",StandardPaper.A4)
    header = Header(text_align = pango.ALIGN_CENTER)
    header.set_text("test header")
    pdf.set_header(header)
    footer = Footer(text_align = pango.ALIGN_CENTER)
    footer.set_text("test footer")
    pdf.set_footer(footer)
    image  = Image()  
    # Image path is relative to the current working directory.
    image.set_image_file("White_peacock.jpg")
    # NOTE(review): 0.25 is presumably a scale factor — confirm against
    # PDFWriter.add_image.
    pdf.add_image(image,0.25)
    pdf.flush()
    """
    table = Table(border_width=1)
    row = Row(height=50)
    for i in range(4):
        cell = Cell("SampleCell "+str(i),font_size=8,width=100)
Exemplo n.º 14
0
class Wikiparser(SGMLParser):
    """SGML parser that renders wiki page markup into "output.pdf".

    Text is collected in ``self.buffer`` while an element is open and is
    handed to ``self.pdf`` when the element closes. The boolean
    attributes (``h1``, ``p``, ``li``, ...) record which elements are
    currently open.
    """

    def reset(self):
        """Reset parser state and start a fresh "output.pdf" document."""
        SGMLParser.reset(self)
        self.images = []
        # TODO make the output file configurable- take it from command line
        self.pdf = PDFWriter("output.pdf", 595, 842)
        header = Header(text_align=pango.ALIGN_CENTER)
        # TODO Alignment not working.
        header.set_text("A wikipedia article")
        self.pdf.set_header(header)
        footer = Footer(text_align=pango.ALIGN_CENTER)
        footer.set_text("wiki2pdf")
        # TODO Alignment not working.
        self.pdf.set_footer(footer)
        self.h1 = False
        self.h2 = False
        self.li = False
        self.p = False
        self.a = False
        self.ul = False
        self.ol = False
        self.span = False
        self.buffer = None

    def handle_data(self, data):
        """Append non-blank text to the buffer while a tracked element is open.

        List items are included so that plain ``<li>`` text (not wrapped
        in a link or span) reaches ``end_li`` instead of being dropped.
        """
        if not data.strip():
            return
        inside_tracked = (self.p or self.h1 or self.h2 or self.a
                          or self.span or self.li)
        if inside_tracked and self.buffer is not None:
            self.buffer += data

    def start_img(self, attrs):
        """Remember the ``src`` of every image encountered."""
        src = [value for key, value in attrs if key == 'src']
        if src:
            self.images.extend(src)

    def start_h1(self, attrs):
        """Start buffering the text of a level-1 heading."""
        self.h1 = True
        self.buffer = ""

    def end_h1(self):
        """Emit the buffered text as a level-1 heading."""
        self.h1 = False
        h1 = Text(self.buffer, font_size=16)
        self.pdf.add_h1(h1)
        self.buffer = None

    def start_h2(self, attrs):
        """Start buffering the text of a level-2 heading."""
        self.h2 = True
        self.buffer = ""

    def end_h2(self):
        """Emit the buffered text as a level-2 heading unless it is blank."""
        self.h2 = False
        if self.buffer and self.buffer.strip():
            h2 = Text(self.buffer, font_size=14)
            self.pdf.add_h2(h2)
        self.buffer = None

    def start_li(self, attrs):
        """Start buffering the text of a list item."""
        self.li = True
        self.buffer = ""

    def end_li(self):
        """Emit the buffered list item; items inside ``<ul>`` get a bullet."""
        self.li = False
        if self.buffer and self.buffer.strip():
            if self.ul:
                li = Text("• " + self.buffer, font_size=10)
            else:
                li = Text(self.buffer, font_size=10)
            self.pdf.add_li(li)
        self.buffer = None

    def start_a(self, attrs):
        """Mark that a link is open."""
        self.a = True

    def end_a(self):
        """Mark that the link is closed."""
        self.a = False

    def start_ol(self, attrs):
        """Mark that an ordered list is open."""
        self.ol = True

    def end_ol(self):
        """Mark that the ordered list is closed."""
        self.ol = False

    def start_ul(self, attrs):
        """Mark that an unordered list is open."""
        self.ul = True

    def end_ul(self):
        """Mark that the unordered list is closed."""
        self.ul = False

    def start_span(self, attrs):
        """Mark that a span is open, creating a buffer if none exists."""
        self.span = True
        if self.buffer is None:
            self.buffer = ""

    def end_span(self):
        """Close the span, separating its text from what follows by a space."""
        # The enclosing element may already have flushed and cleared the
        # buffer (e.g. a <p> closed inside the span); appending to None
        # would raise TypeError.
        if self.buffer is not None:
            self.buffer += " "
        self.span = False

    def start_p(self, attrs):
        """Start buffering the text of a paragraph."""
        self.p = True
        self.buffer = ""

    def end_p(self):
        """Emit the buffered text as a justified paragraph."""
        self.p = False
        para = Paragraph(text=self.buffer, font="Rachana", font_size=10,)
        para.set_justify(True)
        self.pdf.add_paragraph(para)
        self.buffer = None
Exemplo n.º 15
0
class HTMLParser(SGMLParser):
    """SGML parser that renders HTML produced by ``publish_parts`` into a PDF.

    Text is collected in ``self.buffer`` while an element is open and is
    handed to ``self.pdf`` when the element closes. ``self.pdf`` is only
    created by ``parse``.
    """

    def __init__(self, verbose=0):
        """Initialise the parser, passing ``verbose`` to the superclass."""
        SGMLParser.__init__(self, verbose)
        self.hyperlinks = []
        # Created in parse(); the end_* handlers write to it.
        self.pdf = None

    def reset(self):
        """Reset the superclass state, the element flags and the buffer."""
        SGMLParser.reset(self)
        self.images = []

        self.h1 = False
        self.h2 = False
        self.li = False
        self.p = False
        self.a = False
        self.ul = False
        self.ol = False
        self.span = False
        self.buffer = None

    def handle_data(self, data):
        """Append non-blank text to the buffer while a tracked element is open.

        List items are included so that plain ``<li>`` text (not wrapped
        in a link or span) reaches ``end_li`` instead of being dropped.
        """
        if not data.strip():
            return
        inside_tracked = (self.p or self.h1 or self.h2 or self.a
                          or self.span or self.li)
        if inside_tracked and self.buffer is not None:
            self.buffer += data

    def start_img(self, attrs):
        """Remember the ``src`` of every image encountered."""
        src = [value for key, value in attrs if key == 'src']
        if src:
            self.images.extend(src)

    def start_h1(self, attrs):
        """Start buffering the text of a level-1 heading."""
        self.h1 = True
        self.buffer = ""

    def end_h1(self):
        """Emit the buffered text as a level-1 heading."""
        self.h1 = False
        h1 = Text(self.buffer, font="Serif", font_size=16)
        self.pdf.add_text(h1)
        self.buffer = None

    def start_h2(self, attrs):
        """Start buffering the text of a level-2 heading."""
        self.h2 = True
        self.buffer = ""

    def end_h2(self):
        """Emit the buffered text as a level-2 heading unless it is blank."""
        self.h2 = False
        if self.buffer and self.buffer.strip():
            h2 = Text(self.buffer, font="Serif", font_size=14)
            self.pdf.add_text(h2)
        self.buffer = None

    def start_li(self, attrs):
        """Start buffering the text of a list item."""
        self.li = True
        self.buffer = ""

    def end_li(self):
        """Emit the buffered list item; items inside ``<ul>`` get a bullet."""
        self.li = False
        if self.buffer and self.buffer.strip():
            if self.ul:
                li = Text("• " + self.buffer, font_size=10)
            else:
                li = Text(self.buffer, font_size=10)
            self.pdf.add_text(li)
        self.buffer = None

    def start_a(self, attrs):
        """Mark that a link is open."""
        self.a = True

    def end_a(self):
        """Mark that the link is closed."""
        self.a = False

    def start_ol(self, attrs):
        """Mark that an ordered list is open."""
        self.ol = True

    def end_ol(self):
        """Mark that the ordered list is closed."""
        self.ol = False

    def start_ul(self, attrs):
        """Mark that an unordered list is open."""
        self.ul = True

    def end_ul(self):
        """Mark that the unordered list is closed."""
        self.ul = False

    def start_span(self, attrs):
        """Mark that a span is open, creating a buffer if none exists."""
        self.span = True
        if self.buffer is None:
            self.buffer = ""

    def end_span(self):
        """Close the span, separating its text from what follows by a space."""
        # The enclosing element may already have flushed and cleared the
        # buffer (e.g. a <p> closed inside the span); appending to None
        # would raise TypeError.
        if self.buffer is not None:
            self.buffer += " "
        self.span = False

    def start_p(self, attrs):
        """Start buffering the text of a paragraph."""
        self.p = True
        self.buffer = ""

    def end_p(self):
        """Emit the buffered text as a justified, non-hyphenated paragraph."""
        self.p = False
        para = Paragraph(text=self.buffer, font="Serif", font_size=10,)
        para.set_justify(True)
        para.set_hyphenate(False)
        self.pdf.add_paragraph(para)
        self.buffer = None

    def parse(self, filename, outputfile):
        """Convert the text file ``filename`` into the PDF ``outputfile``.

        The file is read as UTF-8, passed through ``publish_parts`` and
        the resulting 'html_body' is fed to this parser. Returns '' when
        ``filename`` cannot be read (IOError); otherwise returns None
        after the PDF has been flushed.
        """
        try:
            # The with-block closes the handle instead of leaking it.
            with codecs.open(filename, 'r', 'utf-8') as source:
                text = source.read()
        except IOError:  # given filename could not be found
            return ''
        parts = publish_parts(text, writer=Writer(), settings_overrides=SETTINGS)
        # Fall back to an empty body so a missing 'html_body' no longer
        # raises a NameError further down.
        html = parts.get('html_body', '')
        self.pdf = PDFWriter(outputfile, StandardPaper.A4)
        footer = Footer()
        header = Header()
        self.pdf.set_footer(footer)
        self.pdf.set_header(header)
        self.feed(html)
        self.close()
        self.pdf.flush()
Exemplo n.º 16
0
class Wikiparser(HTMLParser):
    """Render the wiki article at ``url`` as a PDF file.

    While the article HTML is fed through the parser, the text of the
    element that is currently open (heading, paragraph, list item, table
    cell, caption) is collected in ``self.buffer`` and handed to ``self.pdf``
    when that element is closed.  ``self.buffer`` is ``None`` whenever no
    text-collecting element is open.
    """

    # Tags whose start_/end_ handlers are always called.
    _BLOCK_TAGS = ('h1', 'h2', 'li', 'p', 'a', 'ul', 'ol', 'table', 'span')
    # Tags that are only handled while a wikitable is open.
    _TABLE_TAGS = ('tr', 'td', 'th', 'caption')
    # Inline formatting tags copied into the buffer as markup.
    _INLINE_TAGS = ('sup', 'sub', 'b', 'i', 's', 'small', 'big', 'tt', 'u')

    def __init__(self, url, verbose=0):
        """Create the PDF writer for ``url`` and emit the title page.

        The PDF is named after the unquoted last path component of ``url``.
        ``verbose`` is accepted for backward compatibility and is unused.
        """
        HTMLParser.__init__(self)
        self.hyperlinks = []
        self.url = url
        self.language = detect_language(url)
        page_url = urllib.unquote(self.url)
        title = urllib.unquote(self.url.split("/")[-1])
        self.pdf = PDFWriter(title + ".pdf", StandardPaper.A4)
        header = Header(text_align=pango.ALIGN_CENTER)
        # TODO: alignment not working.
        header.set_text(page_url)
        self.pdf.set_header(header)
        self.pdf.move_context(0, 500)
        h1 = Text(title, font="serif", font_size=32)
        h1.color = StandardColors.Blue
        self.pdf.add_text(h1)
        h2 = Text(page_url, font="serif", font_size=16)
        h2.color = StandardColors.Blue
        self.pdf.add_text(h2)
        footer = Footer(text_align=pango.ALIGN_CENTER)
        footer.set_text("wiki2pdf")
        self.pdf.set_footer(footer)
        self.pdf.page_break()

    def reset(self):
        """Reset the base parser and clear every element flag and counter."""
        HTMLParser.reset(self)
        self.images = []
        self.h1 = False
        self.h2 = False
        self.li = False
        self.p = False
        self.a = False
        self.ul = False
        self.ol = False
        self.span = False
        self.table = False
        self.tr = False
        self.th = False
        self.td = False
        self.caption = False
        self.reference = False
        self.ref_counter = 0
        self.column_counter = 0
        self.current_counter = 0
        self.buffer = None
        self.sup = False

    def _has_class(self, attrs, name):
        """Return True when the ``class`` attribute in ``attrs`` lists ``name``.

        The attribute value is treated as a space separated list, so
        ``class="wikitable sortable"`` matches ``wikitable``.
        """
        for key, value in attrs:
            if key == 'class' and value and name in value.split():
                return True
        return False

    def _inline_markup_allowed(self):
        """Return True when inline tags may be copied into the buffer."""
        return (not self.reference and not self.table
                and self.buffer is not None)

    def handle_data(self, data):
        """Append ``data`` to the buffer while a collecting element is open."""
        if data.strip() == "":
            return
        collecting = (self.p or self.h1 or self.h2 or self.a or self.span
                      or self.li or self.td or self.th or self.caption)
        if collecting and self.buffer is not None:
            self.buffer += data

    def handle_starttag(self, tag, attrs):
        """Dispatch an opening tag to its ``start_<tag>`` handler."""
        if tag in self._INLINE_TAGS:
            if self._inline_markup_allowed():
                self.buffer += "<" + tag + ">"
                self.sup = True
        elif tag == 'img':
            if not self.table:
                self.start_img(attrs)
        elif tag in self._TABLE_TAGS:
            if self.table:
                getattr(self, 'start_' + tag)(attrs)
        elif tag in self._BLOCK_TAGS:
            getattr(self, 'start_' + tag)(attrs)

    def handle_endtag(self, tag):
        """Dispatch a closing tag to its ``end_<tag>`` handler."""
        if tag in self._INLINE_TAGS:
            # Same condition as for the opening tag, so that a closing tag is
            # never written where the opening tag was suppressed.
            if self.sup and self._inline_markup_allowed():
                self.buffer += "</" + tag + ">"
        elif tag == 'img':
            if not self.table:
                self.end_img()
        elif tag in self._TABLE_TAGS:
            if self.table:
                getattr(self, 'end_' + tag)()
        elif tag in self._BLOCK_TAGS:
            getattr(self, 'end_' + tag)()

    def start_img(self, attrs):
        """Queue the ``src`` of an image for download."""
        src = [value for key, value in attrs if key == 'src']
        if src:
            self.images.extend(src)

    def end_img(self):
        """Download the queued images and add them to the PDF."""
        for wiki_image in self.images:
            outpath = self.grab_image(wiki_image, "/tmp")
            if outpath is None:
                # Download failed; there is no file to embed.
                continue
            image = Image()
            image.set_image_file(outpath)
            self.pdf.add_image(image)
        self.images = []

    def _add_heading(self, font_size):
        """Write the buffered text as a blue heading, skipping blank text."""
        if self.buffer and self.buffer.strip():
            heading = Text(self.buffer, font="FreeSerif", font_size=font_size)
            heading.color = StandardColors.Blue
            self.pdf.add_text(heading)
        self.buffer = None

    def start_h1(self, attrs):
        """Enter a first-level heading."""
        self.h1 = True
        self.buffer = ""

    def end_h1(self):
        """Leave a first-level heading and write it to the PDF."""
        self.h1 = False
        self._add_heading(16)

    def start_h2(self, attrs):
        """Enter a second-level heading."""
        self.h2 = True
        self.buffer = ""

    def end_h2(self):
        """Leave a second-level heading and write it to the PDF."""
        self.h2 = False
        self._add_heading(14)

    def start_caption(self, attrs):
        """Enter a table caption."""
        self.caption = True
        self.buffer = ""

    def end_caption(self):
        """Leave a table caption and write it to the PDF."""
        self.caption = False
        self._add_heading(14)

    def start_li(self, attrs):
        """Enter a list item."""
        self.li = True
        self.buffer = ""

    def end_li(self):
        """Leave a list item and write it to the PDF.

        Items of unordered lists get a bullet, items of ordered lists a
        running number (with the "↑" back-link marker removed).
        """
        self.li = False
        if self.buffer and self.buffer.strip():
            if self.ul:
                markup = "• " + self.buffer
            elif self.ol:
                self.ref_counter += 1
                markup = (str(self.ref_counter) + ". "
                          + self.buffer.replace("↑", ""))
            else:
                markup = self.buffer
            self.pdf.add_text(
                Text(markup=markup, font="FreeSerif", font_size=10))
        self.buffer = None

    def start_a(self, attrs):
        """Record that an anchor is open."""
        self.a = True

    def end_a(self):
        """Record that the anchor has been closed."""
        self.a = False

    def start_table(self, attrs):
        """Start collecting a table when it carries the ``wikitable`` class."""
        if self._has_class(attrs, 'wikitable'):
            self.table = True
            # The column count is per table; without this reset every table
            # after the first one would lose all of its rows in end_tr().
            self.column_counter = 0
            self.wikitable = Table(border_width=1)
            self.wikitable.cell_padding = [2, 2, 2, 2]

    def end_table(self):
        """Write the collected table to the PDF."""
        if self.table:
            self.table = False
            self.pdf.add_table(self.wikitable)

    def start_tr(self, attrs):
        """Start a new table row."""
        self.tr = True
        self.row = Row(height=25)
        self.current_counter = 0

    def end_tr(self):
        """Add the row to the table when it has as many cells as the header."""
        self.tr = False
        if self.current_counter == self.column_counter:
            self.wikitable.add_row(self.row)

    def _add_cell(self):
        """Append the buffered text to the current row as a table cell."""
        # The buffer is None when an element closed inside the cell released it.
        cell_content = Text(self.buffer or "", font_size=10)
        cell_content.color = Color(0.0, 0.0, 0.0, 1.0)
        self.row.add_cell(Cell(cell_content, font_size=8, width=100))
        self.current_counter += 1
        self.buffer = None

    def start_td(self, attrs):
        """Enter a table data cell."""
        self.td = True
        self.buffer = ""

    def end_td(self):
        """Leave a table data cell and add it to the current row."""
        self.td = False
        self._add_cell()

    def start_th(self, attrs):
        """Enter a table header cell."""
        self.th = True
        self.buffer = ""

    def end_th(self):
        """Leave a table header cell; header cells define the column count."""
        self.th = False
        self.column_counter += 1
        self._add_cell()

    def start_ol(self, attrs):
        """Enter an ordered list, noting whether it is the reference list."""
        self.ol = True
        if self._has_class(attrs, 'references'):
            self.reference = True

    def end_ol(self):
        """Leave an ordered list and restart the item numbering."""
        self.ol = False
        self.ref_counter = 0
        self.reference = False

    def start_ul(self, attrs):
        """Record that an unordered list is open."""
        self.ul = True

    def end_ul(self):
        """Record that the unordered list has been closed."""
        self.ul = False

    def start_span(self, attrs):
        """Enter a span, creating a buffer only if none is being collected."""
        self.span = True
        if self.buffer is None:
            self.buffer = ""

    def end_span(self):
        """Leave a span, separating its text from what follows with a space."""
        # The buffer is None for a stray </span> or when an element closed
        # inside the span already released it.
        if self.buffer is not None:
            self.buffer += " "
        self.span = False

    def start_p(self, attrs):
        """Enter a paragraph."""
        self.p = True
        self.buffer = ""

    def end_p(self):
        """Leave a paragraph and write it to the PDF.

        The text is passed as markup as well when inline formatting tags
        were copied into the buffer.
        """
        self.p = False
        if self.sup:
            para = Paragraph(markup=self.buffer, text=self.buffer,
                             font="FreeSerif", font_size=10)
            self.sup = False
        else:
            para = Paragraph(text=self.buffer, font="FreeSerif", font_size=10)
        para.set_justify(True)
        para.language = self.language if self.language else None
        para.set_hyphenate(True)
        self.pdf.add_paragraph(para)
        self.buffer = None

    def set_header(self, text):
        """Store ``text`` as ``self.header``."""
        self.header = text

    def grab_image(self, imageurl, outputfolder):
        """Download ``imageurl`` into ``outputfolder`` and return the local path.

        A file of the same name that already exists is reused without any
        network access.  Returns ``None`` when the download fails with an
        HTTP error; the process exits on ``KeyboardInterrupt``.
        """
        output_filename = None
        try:
            link = imageurl.strip()
            output_filename = os.path.join(outputfolder, link.split("/")[-1])
            print("GET IMAGE " + link + " ==> " + output_filename)
            if os.path.isfile(output_filename):
                print("File " + output_filename + " already exists")
                return output_filename
            opener = urllib2.build_opener()
            opener.addheaders = [('User-agent', 'Mozilla/5.0')]
            infile = opener.open(link)
            try:
                data = infile.read()
            finally:
                infile.close()
            # Image data is binary; text mode would corrupt it on platforms
            # that translate line endings.
            with open(output_filename, "wb") as image_file:
                image_file.write(data)
        except KeyboardInterrupt:
            sys.exit()
        except urllib2.HTTPError:
            print("Error: Cound not download the image")
            return None
        return output_filename

    def parse(self):
        """Fetch ``self.url``, feed the cleaned page through the parser and
        flush the PDF."""
        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        infile = opener.open(self.url)
        try:
            page = infile.read()
        finally:
            infile.close()
        self.feed(cleanup(page))
        self.close()
        self.pdf.flush()
Exemplo n.º 17
0
class Wikiparser(HTMLParser):
    """Render the wiki article at ``url`` as a PDF file.

    While the article HTML is fed through the parser, the text of the
    element that is currently open (heading, paragraph, list item, table
    cell, caption) is collected in ``self.buffer`` and handed to ``self.pdf``
    when that element is closed.  ``self.buffer`` is ``None`` whenever no
    text-collecting element is open.
    """

    # Tags whose start_/end_ handlers are always called.
    _BLOCK_TAGS = ('h1', 'h2', 'li', 'p', 'a', 'ul', 'ol', 'table', 'span')
    # Tags that are only handled while a wikitable is open.
    _TABLE_TAGS = ('tr', 'td', 'th', 'caption')
    # Inline formatting tags copied into the buffer as markup.
    _INLINE_TAGS = ('sup', 'sub', 'b', 'i', 's', 'small', 'big', 'tt', 'u')

    def __init__(self, url, verbose=0):
        """Create the PDF writer for ``url`` and emit the title page.

        The PDF is named after the unquoted last path component of ``url``.
        ``verbose`` is accepted for backward compatibility and is unused.
        """
        HTMLParser.__init__(self)
        self.hyperlinks = []
        self.url = url
        self.language = detect_language(url)
        page_url = urllib.unquote(self.url)
        title = urllib.unquote(self.url.split("/")[-1])
        self.pdf = PDFWriter(title + ".pdf", StandardPaper.A4)
        header = Header(text_align=pango.ALIGN_CENTER)
        # TODO: alignment not working.
        header.set_text(page_url)
        self.pdf.set_header(header)
        self.pdf.move_context(0, 500)
        h1 = Text(title, font="serif", font_size=32)
        h1.color = StandardColors.Blue
        self.pdf.add_text(h1)
        h2 = Text(page_url, font="serif", font_size=16)
        h2.color = StandardColors.Blue
        self.pdf.add_text(h2)
        footer = Footer(text_align=pango.ALIGN_CENTER)
        footer.set_text("wiki2pdf")
        self.pdf.set_footer(footer)
        self.pdf.page_break()

    def reset(self):
        """Reset the base parser and clear every element flag and counter."""
        HTMLParser.reset(self)
        self.images = []
        self.h1 = False
        self.h2 = False
        self.li = False
        self.p = False
        self.a = False
        self.ul = False
        self.ol = False
        self.span = False
        self.table = False
        self.tr = False
        self.th = False
        self.td = False
        self.caption = False
        self.reference = False
        self.ref_counter = 0
        self.column_counter = 0
        self.current_counter = 0
        self.buffer = None
        self.sup = False

    def _has_class(self, attrs, name):
        """Return True when the ``class`` attribute in ``attrs`` lists ``name``.

        The attribute value is treated as a space separated list, so
        ``class="wikitable sortable"`` matches ``wikitable``.
        """
        for key, value in attrs:
            if key == 'class' and value and name in value.split():
                return True
        return False

    def _inline_markup_allowed(self):
        """Return True when inline tags may be copied into the buffer."""
        return (not self.reference and not self.table
                and self.buffer is not None)

    def handle_data(self, data):
        """Append ``data`` to the buffer while a collecting element is open."""
        if data.strip() == "":
            return
        collecting = (self.p or self.h1 or self.h2 or self.a or self.span
                      or self.li or self.td or self.th or self.caption)
        if collecting and self.buffer is not None:
            self.buffer += data

    def handle_starttag(self, tag, attrs):
        """Dispatch an opening tag to its ``start_<tag>`` handler."""
        if tag in self._INLINE_TAGS:
            if self._inline_markup_allowed():
                self.buffer += "<" + tag + ">"
                self.sup = True
        elif tag == 'img':
            if not self.table:
                self.start_img(attrs)
        elif tag in self._TABLE_TAGS:
            if self.table:
                getattr(self, 'start_' + tag)(attrs)
        elif tag in self._BLOCK_TAGS:
            getattr(self, 'start_' + tag)(attrs)

    def handle_endtag(self, tag):
        """Dispatch a closing tag to its ``end_<tag>`` handler."""
        if tag in self._INLINE_TAGS:
            # Same condition as for the opening tag, so that a closing tag is
            # never written where the opening tag was suppressed.
            if self.sup and self._inline_markup_allowed():
                self.buffer += "</" + tag + ">"
        elif tag == 'img':
            if not self.table:
                self.end_img()
        elif tag in self._TABLE_TAGS:
            if self.table:
                getattr(self, 'end_' + tag)()
        elif tag in self._BLOCK_TAGS:
            getattr(self, 'end_' + tag)()

    def start_img(self, attrs):
        """Queue the ``src`` of an image for download."""
        src = [value for key, value in attrs if key == 'src']
        if src:
            self.images.extend(src)

    def end_img(self):
        """Download the queued images and add them to the PDF."""
        for wiki_image in self.images:
            outpath = self.grab_image(wiki_image, "/tmp")
            if outpath is None:
                # Download failed; there is no file to embed.
                continue
            image = Image()
            image.set_image_file(outpath)
            self.pdf.add_image(image)
        self.images = []

    def _add_heading(self, font_size):
        """Write the buffered text as a blue heading, skipping blank text."""
        if self.buffer and self.buffer.strip():
            heading = Text(self.buffer, font="FreeSerif", font_size=font_size)
            heading.color = StandardColors.Blue
            self.pdf.add_text(heading)
        self.buffer = None

    def start_h1(self, attrs):
        """Enter a first-level heading."""
        self.h1 = True
        self.buffer = ""

    def end_h1(self):
        """Leave a first-level heading and write it to the PDF."""
        self.h1 = False
        self._add_heading(16)

    def start_h2(self, attrs):
        """Enter a second-level heading."""
        self.h2 = True
        self.buffer = ""

    def end_h2(self):
        """Leave a second-level heading and write it to the PDF."""
        self.h2 = False
        self._add_heading(14)

    def start_caption(self, attrs):
        """Enter a table caption."""
        self.caption = True
        self.buffer = ""

    def end_caption(self):
        """Leave a table caption and write it to the PDF."""
        self.caption = False
        self._add_heading(14)

    def start_li(self, attrs):
        """Enter a list item."""
        self.li = True
        self.buffer = ""

    def end_li(self):
        """Leave a list item and write it to the PDF.

        Items of unordered lists get a bullet, items of ordered lists a
        running number (with the "↑" back-link marker removed).
        """
        self.li = False
        if self.buffer and self.buffer.strip():
            if self.ul:
                markup = "• " + self.buffer
            elif self.ol:
                self.ref_counter += 1
                markup = (str(self.ref_counter) + ". "
                          + self.buffer.replace("↑", ""))
            else:
                markup = self.buffer
            self.pdf.add_text(
                Text(markup=markup, font="FreeSerif", font_size=10))
        self.buffer = None

    def start_a(self, attrs):
        """Record that an anchor is open."""
        self.a = True

    def end_a(self):
        """Record that the anchor has been closed."""
        self.a = False

    def start_table(self, attrs):
        """Start collecting a table when it carries the ``wikitable`` class."""
        if self._has_class(attrs, 'wikitable'):
            self.table = True
            # The column count is per table; without this reset every table
            # after the first one would lose all of its rows in end_tr().
            self.column_counter = 0
            self.wikitable = Table(border_width=1)
            self.wikitable.cell_padding = [2, 2, 2, 2]

    def end_table(self):
        """Write the collected table to the PDF."""
        if self.table:
            self.table = False
            self.pdf.add_table(self.wikitable)

    def start_tr(self, attrs):
        """Start a new table row."""
        self.tr = True
        self.row = Row(height=25)
        self.current_counter = 0

    def end_tr(self):
        """Add the row to the table when it has as many cells as the header."""
        self.tr = False
        if self.current_counter == self.column_counter:
            self.wikitable.add_row(self.row)

    def _add_cell(self):
        """Append the buffered text to the current row as a table cell."""
        # The buffer is None when an element closed inside the cell released it.
        cell_content = Text(self.buffer or "", font_size=10)
        cell_content.color = Color(0.0, 0.0, 0.0, 1.0)
        self.row.add_cell(Cell(cell_content, font_size=8, width=100))
        self.current_counter += 1
        self.buffer = None

    def start_td(self, attrs):
        """Enter a table data cell."""
        self.td = True
        self.buffer = ""

    def end_td(self):
        """Leave a table data cell and add it to the current row."""
        self.td = False
        self._add_cell()

    def start_th(self, attrs):
        """Enter a table header cell."""
        self.th = True
        self.buffer = ""

    def end_th(self):
        """Leave a table header cell; header cells define the column count."""
        self.th = False
        self.column_counter += 1
        self._add_cell()

    def start_ol(self, attrs):
        """Enter an ordered list, noting whether it is the reference list."""
        self.ol = True
        if self._has_class(attrs, 'references'):
            self.reference = True

    def end_ol(self):
        """Leave an ordered list and restart the item numbering."""
        self.ol = False
        self.ref_counter = 0
        self.reference = False

    def start_ul(self, attrs):
        """Record that an unordered list is open."""
        self.ul = True

    def end_ul(self):
        """Record that the unordered list has been closed."""
        self.ul = False

    def start_span(self, attrs):
        """Enter a span, creating a buffer only if none is being collected."""
        self.span = True
        if self.buffer is None:
            self.buffer = ""

    def end_span(self):
        """Leave a span, separating its text from what follows with a space."""
        # The buffer is None for a stray </span> or when an element closed
        # inside the span already released it.
        if self.buffer is not None:
            self.buffer += " "
        self.span = False

    def start_p(self, attrs):
        """Enter a paragraph."""
        self.p = True
        self.buffer = ""

    def end_p(self):
        """Leave a paragraph and write it to the PDF.

        The text is passed as markup as well when inline formatting tags
        were copied into the buffer.
        """
        self.p = False
        if self.sup:
            para = Paragraph(markup=self.buffer, text=self.buffer,
                             font="FreeSerif", font_size=10)
            self.sup = False
        else:
            para = Paragraph(text=self.buffer, font="FreeSerif", font_size=10)
        para.set_justify(True)
        para.language = self.language if self.language else None
        para.set_hyphenate(True)
        self.pdf.add_paragraph(para)
        self.buffer = None

    def set_header(self, text):
        """Store ``text`` as ``self.header``."""
        self.header = text

    def grab_image(self, imageurl, outputfolder):
        """Download ``imageurl`` into ``outputfolder`` and return the local path.

        A file of the same name that already exists is reused without any
        network access.  Returns ``None`` when the download fails with an
        HTTP error; the process exits on ``KeyboardInterrupt``.
        """
        output_filename = None
        try:
            link = imageurl.strip()
            output_filename = os.path.join(outputfolder, link.split("/")[-1])
            print("GET IMAGE " + link + " ==> " + output_filename)
            if os.path.isfile(output_filename):
                print("File " + output_filename + " already exists")
                return output_filename
            opener = urllib2.build_opener()
            opener.addheaders = [('User-agent', 'Mozilla/5.0')]
            infile = opener.open(link)
            try:
                data = infile.read()
            finally:
                infile.close()
            # Image data is binary; text mode would corrupt it on platforms
            # that translate line endings.
            with open(output_filename, "wb") as image_file:
                image_file.write(data)
        except KeyboardInterrupt:
            sys.exit()
        except urllib2.HTTPError:
            print("Error: Cound not download the image")
            return None
        return output_filename

    def parse(self):
        """Fetch ``self.url``, feed the cleaned page through the parser and
        flush the PDF."""
        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        infile = opener.open(self.url)
        try:
            page = infile.read()
        finally:
            infile.close()
        self.feed(cleanup(page))
        self.close()
        self.pdf.flush()
Exemplo n.º 18
0
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with pypdflib.  If not, see <http://www.gnu.org/licenses/>.

import sys
sys.path.append("../src/")  #not good!
from pypdflib.writer import PDFWriter
from pypdflib.widgets import *
from pypdflib.styles import *
import pango

if __name__ == "__main__":
    pdf = PDFWriter("image.pdf", StandardPaper.A4)
    header = Header(text_align=pango.ALIGN_CENTER)
    #TODO Alignment not working.
    header.set_text("test header")
    pdf.set_header(header)
    footer = Footer(text_align=pango.ALIGN_CENTER)
    footer.set_text("test footer")
    #TODO Alignment not working.
    pdf.set_footer(footer)
    image = Image()
    image.set_image_file("White_peacock.jpg")
    pdf.add_image(image)
    pdf.flush()
    """
    table = Table(border_width=1)
    row = Row(height=50)
Exemplo n.º 19
0
# pypdflib is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with pypdflib.  If not, see <http://www.gnu.org/licenses/>.
import sys
sys.path.append("../")  
from pypdflib.writer import PDFWriter
from pypdflib.widgets import *
from pypdflib.styles import *
import pango

if __name__=="__main__":
    pdf = PDFWriter("output.pdf",595, 842)
    header = Header(text_align = pango.ALIGN_CENTER)
    #TODO Alignment not working.
    header.set_text("test header")
    pdf.set_header(header)
    footer = Footer(text_align = pango.ALIGN_CENTER)
    footer.set_text("test footer")
    #TODO Alignment not working.
    pdf.set_footer(footer)
    h1= Text("Samples",font_size=16) 
    pdf.add_h1(h1)
    h2= Text("Malayalam",font_size=14) 
    pdf.add_h2(h2)
    para_file_malayalam=open("malayalam.txt")
    #image = Image(image_file="Four_Sons_of_Dasaratha.png")
    #pdf.add_image(image)
Exemplo n.º 20
0
class Wikiparser(SGMLParser):
    """Render the wiki article at ``url`` as a PDF file.

    The ``start_<tag>``/``end_<tag>`` methods are invoked by ``SGMLParser``
    for the matching tags.  The text of the element that is currently open
    is collected in ``self.buffer`` and handed to ``self.pdf`` when that
    element is closed; ``self.buffer`` is ``None`` while nothing is being
    collected.
    """

    def __init__(self, url, filename, verbose=0):
        """Create the PDF writer and emit the title page for ``url``.

        The PDF is written as ``filename`` inside the ``tmp`` folder that
        sits next to this module.  ``verbose`` is passed to ``SGMLParser``.
        """
        SGMLParser.__init__(self, verbose)
        self.hyperlinks = []
        self.url = url
        self.language = detect_language(url)
        tmp_folder = os.path.join(os.path.dirname(__file__), "tmp")
        self.pdf = PDFWriter(os.path.join(tmp_folder, filename),
                             StandardPaper.A4)
        page_url = urllib.unquote(self.url)
        title = urllib.unquote(self.url.split("/")[-1])
        header = Header(text_align=pango.ALIGN_CENTER)
        # TODO: alignment not working.
        header.set_text(page_url)
        self.pdf.set_header(header)
        self.pdf.move_context(0, 500)
        h1 = Text(title, font="FreeSerif", font_size=32)
        h1.color = StandardColors.Blue
        self.pdf.add_text(h1)
        h2 = Text(page_url, font="FreeSerif", font_size=16)
        h2.color = StandardColors.Blue
        self.pdf.add_text(h2)
        footer = Footer(text_align=pango.ALIGN_CENTER)
        footer.set_text("wiki2pdf")
        self.pdf.set_footer(footer)
        self.pdf.page_break()

    def reset(self):
        """Reset the base parser and clear every element flag."""
        SGMLParser.reset(self)
        self.images = []
        self.h1 = False
        self.h2 = False
        self.li = False
        self.p = False
        self.a = False
        self.ul = False
        self.ol = False
        self.span = False
        self.buffer = None

    def handle_data(self, data):
        """Append ``data`` to the buffer while a collecting element is open."""
        if data.strip() == "":
            return
        # List items collect text as well; without ``self.li`` any item text
        # outside a link or span was silently dropped.
        collecting = (self.p or self.h1 or self.h2 or self.a or self.span
                      or self.li)
        if collecting and self.buffer is not None:
            self.buffer += data

    def start_img(self, attrs):
        """Queue the ``src`` of an image for download."""
        src = [value for key, value in attrs if key == 'src']
        if src:
            self.images.extend(src)

    def end_img(self):
        """Download the queued images and add them to the PDF."""
        for wiki_image in self.images:
            outpath = self.grab_image(wiki_image, "/tmp")
            if outpath is None:
                # Download failed; there is no file to embed.
                continue
            image = Image()
            image.set_image_file(outpath)
            self.pdf.add_image(image)
        self.images = []

    def _add_heading(self, font_size):
        """Write the buffered text as a blue heading, skipping blank text."""
        if self.buffer and self.buffer.strip():
            heading = Text(self.buffer, font="FreeSerif", font_size=font_size)
            heading.color = StandardColors.Blue
            self.pdf.add_text(heading)
        self.buffer = None

    def start_h1(self, attrs):
        """Enter a first-level heading."""
        self.h1 = True
        self.buffer = ""

    def end_h1(self):
        """Leave a first-level heading and write it to the PDF."""
        self.h1 = False
        self._add_heading(16)

    def start_h2(self, attrs):
        """Enter a second-level heading."""
        self.h2 = True
        self.buffer = ""

    def end_h2(self):
        """Leave a second-level heading and write it to the PDF."""
        self.h2 = False
        self._add_heading(14)

    def start_li(self, attrs):
        """Enter a list item."""
        self.li = True
        self.buffer = ""

    def end_li(self):
        """Leave a list item and write it to the PDF.

        Items closed while an unordered list is open get a bullet prefix;
        blank items are dropped.
        """
        self.li = False
        if self.buffer and self.buffer.strip():
            prefix = "• " if self.ul else ""
            self.pdf.add_text(Text(prefix + self.buffer, font_size=10))
        self.buffer = None

    def start_a(self, attrs):
        """Record that an anchor is open."""
        self.a = True

    def end_a(self):
        """Record that the anchor has been closed."""
        self.a = False

    def start_ol(self, attrs):
        """Record that an ordered list is open."""
        self.ol = True

    def end_ol(self):
        """Record that the ordered list has been closed."""
        self.ol = False

    def start_ul(self, attrs):
        """Record that an unordered list is open."""
        self.ul = True

    def end_ul(self):
        """Record that the unordered list has been closed."""
        self.ul = False

    def start_span(self, attrs):
        """Enter a span, creating a buffer only if none is being collected."""
        self.span = True
        if self.buffer is None:
            self.buffer = ""

    def end_span(self):
        """Leave a span, separating its text from what follows with a space."""
        # The buffer is None for a stray </span> or when an element closed
        # inside the span already released it.
        if self.buffer is not None:
            self.buffer += " "
        self.span = False

    def start_p(self, attrs):
        """Enter a paragraph."""
        self.p = True
        self.buffer = ""

    def end_p(self):
        """Leave a paragraph and write it to the PDF (justified, hyphenated)."""
        self.p = False
        para = Paragraph(text=self.buffer, font="FreeSerif", font_size=10)
        para.set_justify(True)
        para.language = self.language if self.language else None
        para.set_hyphenate(True)
        self.pdf.add_paragraph(para)
        self.buffer = None

    def set_header(self, text):
        """Store ``text`` as ``self.header``."""
        self.header = text

    def grab_image(self, imageurl, outputfolder):
        """Download ``imageurl`` into ``outputfolder`` and return the local path.

        A file of the same name that already exists is reused without any
        network access.  Returns ``None`` when the download fails with an
        HTTP error; the process exits on ``KeyboardInterrupt``.
        """
        output_filename = None
        try:
            link = imageurl.strip()
            output_filename = os.path.join(outputfolder, link.split("/")[-1])
            print("GET IMAGE " + link + " ==> " + output_filename)
            if os.path.isfile(output_filename):
                print("File " + output_filename + " already exists")
                return output_filename
            opener = urllib2.build_opener()
            opener.addheaders = [('User-agent', 'Mozilla/5.0')]
            infile = opener.open(link)
            try:
                data = infile.read()
            finally:
                infile.close()
            # Image data is binary; text mode would corrupt it on platforms
            # that translate line endings.
            with open(output_filename, "wb") as image_file:
                image_file.write(data)
        except KeyboardInterrupt:
            sys.exit()
        except urllib2.HTTPError:
            print("Error: Cound not download the image")
            return None
        return output_filename

    def parse(self):
        """Fetch the article, feed the cleaned page through the parser and
        flush the PDF.

        The request goes to ``<scheme>://<host>/wiki/<name>``, where
        ``<name>`` is the URL-quoted last path component of ``self.url``.
        """
        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        address = urlparse.urlparse(self.url)
        article = self.url.split("/")[-1]
        link = (address.scheme + "://" + address.netloc + "/wiki/"
                + urllib.quote(article.encode('utf-8')))
        print("Get : " + link)
        infile = opener.open(link)
        try:
            page = infile.read()
        finally:
            infile.close()
        self.feed(cleanup(page))
        self.close()
        self.pdf.flush()