Пример #1
0
class Wikiparser(SGMLParser):
    """Turn the HTML of a wiki article into a PDF document.

    The page at ``url`` is downloaded by :meth:`parse` and walked through
    SGMLParser's ``start_<tag>`` / ``end_<tag>`` callbacks.  Text found
    between a start tag and its end tag is collected in ``self.buffer``
    and handed to the PDFWriter as a heading, list item or paragraph when
    the end tag arrives.  ``self.buffer`` is ``None`` whenever no element
    is currently collecting text.
    """

    def __init__(self, url, verbose=0):
        """Initialise the parser and write the title page of the PDF.

        url     -- address of the article; the part after the last "/" is
                   used as the document title and as the output file name
                   (``<last segment>.pdf``).
        verbose -- passed through to SGMLParser.
        """
        SGMLParser.__init__(self, verbose)
        self.hyperlinks = []
        self.url = url
        title = self.url.split("/")[-1]
        # 595 x 842 is presumably A4 in points -- confirm against PDFWriter.
        self.pdf = PDFWriter(title + ".pdf", 595, 842)

        header = Header(text_align=pango.ALIGN_CENTER)
        #TODO Alignment not working.
        header.set_text(self.url)
        self.pdf.set_header(header)

        self.pdf.add_h1(Text(title, font="Dyuthi", font_size=32))
        # Font name was misspelt "Rachan"; every other use in this file
        # says "Rachana".
        self.pdf.add_h2(Text(self.url, font="Rachana", font_size=16))

        footer = Footer(text_align=pango.ALIGN_CENTER)
        footer.set_text("wiki2pdf")
        self.pdf.set_footer(footer)
        self.pdf.page_break()

    def reset(self):
        """Reset the SGMLParser state and clear all per-document flags."""
        SGMLParser.reset(self)
        self.images = []
        # One flag per tag of interest; True while inside that element.
        self.h1 = False
        self.h2 = False
        self.li = False
        self.p = False
        self.a = False
        self.ul = False
        self.ol = False
        self.span = False
        # None means "not collecting text right now".
        self.buffer = None

    def _take_buffer(self):
        """Return the collected text ("" if there is none) and stop collecting."""
        text = self.buffer or ""
        self.buffer = None
        return text

    def handle_data(self, data):
        """Append character data to the buffer while inside a tracked element.

        Whitespace-only data is ignored, as is any data seen while no
        element is collecting text.
        """
        if self.buffer is None or not data.strip():
            return
        # self.li is included so that plain text inside a list item is kept
        # and not only the text of nested links/spans.
        if self.p or self.h1 or self.h2 or self.li or self.a or self.span:
            self.buffer += data

    def start_img(self, attrs):
        """Remember the ``src`` of every image encountered."""
        self.images.extend(value for key, value in attrs if key == 'src')

    def start_h1(self, attrs):
        """Begin collecting the text of a level-1 heading."""
        self.h1 = True
        self.buffer = ""

    def end_h1(self):
        """Emit the collected level-1 heading, unless it is blank."""
        self.h1 = False
        text = self._take_buffer()
        # Blank headings are skipped, matching end_h2 / end_li.
        if text.strip():
            self.pdf.add_h1(Text(text, font="Dyuthi", font_size=16))

    def start_h2(self, attrs):
        """Begin collecting the text of a level-2 heading."""
        self.h2 = True
        self.buffer = ""

    def end_h2(self):
        """Emit the collected level-2 heading, unless it is blank."""
        self.h2 = False
        text = self._take_buffer()
        if text.strip():
            self.pdf.add_h2(Text(text, font="Rachana", font_size=14))

    def start_li(self, attrs):
        """Begin collecting the text of a list item."""
        self.li = True
        self.buffer = ""

    def end_li(self):
        """Emit the collected list item; items of a <ul> get a bullet prefix."""
        self.li = False
        text = self._take_buffer()
        if not text.strip():
            return
        if self.ul:
            text = "• " + text
        self.pdf.add_li(Text(text, font_size=10))

    def start_a(self, attrs):
        """Note that the parser is inside a hyperlink."""
        self.a = True

    def end_a(self):
        """Note that the hyperlink has ended."""
        self.a = False

    def start_ol(self, attrs):
        """Note that the parser is inside an ordered list."""
        self.ol = True

    def end_ol(self):
        """Note that the ordered list has ended."""
        self.ol = False

    def start_ul(self, attrs):
        """Note that the parser is inside an unordered list."""
        self.ul = True

    def end_ul(self):
        """Note that the unordered list has ended."""
        self.ul = False

    def start_span(self, attrs):
        """Enter a span; start collecting only if nothing is collecting yet."""
        self.span = True
        if self.buffer is None:
            self.buffer = ""

    def end_span(self):
        """Leave a span, separating its text from what follows with a space."""
        self.span = False
        # An element closed inside the span may already have reset the
        # buffer to None; appending to it would raise TypeError.
        if self.buffer is not None:
            self.buffer += " "

    def start_p(self, attrs):
        """Begin collecting the text of a paragraph."""
        self.p = True
        self.buffer = ""

    def end_p(self):
        """Emit the collected paragraph as justified, hyphenated text.

        Blank paragraphs are skipped, matching end_h2 / end_li.
        """
        self.p = False
        text = self._take_buffer()
        if not text.strip():
            return
        para = Paragraph(text=text, font="Rachana", font_size=10)
        para.set_justify(True)
        para.language = "ml_IN"
        para.set_hyphenate(True)
        self.pdf.add_paragraph(para)

    def set_header(self, text):
        """Store ``text`` as ``self.header`` (not read elsewhere in this class)."""
        self.header = text

    def parse(self):
        """Download ``self.url``, clean the markup and run it through the parser."""
        opener = urllib2.build_opener()
        # Identify as a browser instead of using urllib2's default agent.
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        infile = opener.open(self.url)
        try:
            page = infile.read()
        finally:
            # Release the connection even if the read fails.
            infile.close()
        self.feed(cleanup(page))
        self.close()
Пример #2
0
import pango

if __name__ == "__main__":
    # Demo: build output.pdf with a header, a footer, two headings and the
    # contents of malayalam.txt, one paragraph per line of the file.
    # 595 x 842 is presumably A4 in points -- confirm against PDFWriter.
    pdf = PDFWriter("output.pdf", 595, 842)

    header = Header(text_align=pango.ALIGN_CENTER)
    #TODO Alignment not working.
    header.set_text("test header")
    pdf.set_header(header)

    footer = Footer(text_align=pango.ALIGN_CENTER)
    footer.set_text("test footer")
    #TODO Alignment not working.
    pdf.set_footer(footer)

    pdf.add_h1(Text("Samples", font_size=16))
    pdf.add_h2(Text("Malayalam", font_size=14))
    #image = Image(image_file="Four_Sons_of_Dasaratha.png")
    #pdf.add_image(image)

    # "with" closes the file even if adding a paragraph fails; iterating
    # the file yields the same lines the readline() loop did.
    with open("malayalam.txt") as para_file_malayalam:
        for para_content in para_file_malayalam:
            pdf.add_paragraph(Paragraph(text=para_content, font="Rachana"))

    pdf.add_h2(Text("Hindi", font_size=14, font="Rachana"))
    with open("hindi.txt") as para_file_hindi:
        # NOTE(review): as in the original excerpt, the Hindi lines are read
        # but nothing is done with them -- the snippet appears to be cut off
        # here; confirm what should happen with each line.
        for para_content in para_file_hindi:
            pass
Пример #3
0
class Wikiparser(SGMLParser):
    """Turn the HTML of a wiki article into ``output.pdf``.

    SGMLParser's ``start_<tag>`` / ``end_<tag>`` callbacks collect the text
    between a start tag and its end tag in ``self.buffer`` and hand it to
    the PDFWriter as a heading, list item or paragraph when the end tag
    arrives.  ``self.buffer`` is ``None`` whenever no element is currently
    collecting text.
    """

    def reset(self):
        """Reset the parser state and start a fresh PDF document."""
        SGMLParser.reset(self)
        self.images = []
        #TODO make the output file configurable- take it from command line
        # 595 x 842 is presumably A4 in points -- confirm against PDFWriter.
        self.pdf = PDFWriter("output.pdf", 595, 842)

        header = Header(text_align=pango.ALIGN_CENTER)
        #TODO Alignment not working.
        header.set_text("A wikipedia article")
        self.pdf.set_header(header)

        footer = Footer(text_align=pango.ALIGN_CENTER)
        footer.set_text("wiki2pdf")
        #TODO Alignment not working.
        self.pdf.set_footer(footer)

        # One flag per tag of interest; True while inside that element.
        self.h1 = False
        self.h2 = False
        self.li = False
        self.p = False
        self.a = False
        self.ul = False
        self.ol = False
        self.span = False
        # None means "not collecting text right now".
        self.buffer = None

    def _take_buffer(self):
        """Return the collected text ("" if there is none) and stop collecting."""
        text = self.buffer or ""
        self.buffer = None
        return text

    def handle_data(self, data):
        """Append character data to the buffer while inside a tracked element.

        Whitespace-only data is ignored, as is any data seen while no
        element is collecting text.
        """
        if self.buffer is None or not data.strip():
            return
        # self.li is included so that plain text inside a list item is kept
        # and not only the text of nested links/spans.
        if self.p or self.h1 or self.h2 or self.li or self.a or self.span:
            self.buffer += data

    def start_img(self, attrs):
        """Remember the ``src`` of every image encountered."""
        self.images.extend(value for key, value in attrs if key == 'src')

    def start_h1(self, attrs):
        """Begin collecting the text of a level-1 heading."""
        self.h1 = True
        self.buffer = ""

    def end_h1(self):
        """Emit the collected level-1 heading, unless it is blank."""
        self.h1 = False
        text = self._take_buffer()
        # Blank headings are skipped, matching end_h2 / end_li.
        if text.strip():
            self.pdf.add_h1(Text(text, font_size=16))

    def start_h2(self, attrs):
        """Begin collecting the text of a level-2 heading."""
        self.h2 = True
        self.buffer = ""

    def end_h2(self):
        """Emit the collected level-2 heading, unless it is blank."""
        self.h2 = False
        text = self._take_buffer()
        if text.strip():
            self.pdf.add_h2(Text(text, font_size=14))

    def start_li(self, attrs):
        """Begin collecting the text of a list item."""
        self.li = True
        self.buffer = ""

    def end_li(self):
        """Emit the collected list item; items of a <ul> get a bullet prefix."""
        self.li = False
        text = self._take_buffer()
        if not text.strip():
            return
        if self.ul:
            text = "• " + text
        self.pdf.add_li(Text(text, font_size=10))

    def start_a(self, attrs):
        """Note that the parser is inside a hyperlink."""
        self.a = True

    def end_a(self):
        """Note that the hyperlink has ended."""
        self.a = False

    def start_ol(self, attrs):
        """Note that the parser is inside an ordered list."""
        self.ol = True

    def end_ol(self):
        """Note that the ordered list has ended."""
        self.ol = False

    def start_ul(self, attrs):
        """Note that the parser is inside an unordered list."""
        self.ul = True

    def end_ul(self):
        """Note that the unordered list has ended."""
        self.ul = False

    def start_span(self, attrs):
        """Enter a span; start collecting only if nothing is collecting yet."""
        self.span = True
        if self.buffer is None:
            self.buffer = ""

    def end_span(self):
        """Leave a span, separating its text from what follows with a space."""
        self.span = False
        # An element closed inside the span may already have reset the
        # buffer to None; appending to it would raise TypeError.
        if self.buffer is not None:
            self.buffer += " "

    def start_p(self, attrs):
        """Begin collecting the text of a paragraph."""
        self.p = True
        self.buffer = ""

    def end_p(self):
        """Emit the collected paragraph as justified text.

        Blank paragraphs are skipped, matching end_h2 / end_li.
        """
        self.p = False
        text = self._take_buffer()
        if not text.strip():
            return
        para = Paragraph(text=text, font="Rachana", font_size=10)
        para.set_justify(True)
        self.pdf.add_paragraph(para)