class Wikiparser(HTMLParser):
    """Turn the Wikipedia page at ``url`` into a PDF using pypdflib widgets.

    State is a set of boolean "currently inside <tag>" flags plus one text
    ``buffer``.  Character data is appended to the buffer while a tracked
    element is open; the matching ``end_*`` handler converts the buffer into
    a PDF widget and sets the buffer back to ``None``.
    """

    # Inline formatting tags copied verbatim into the buffer as markup.
    _INLINE_TAGS = frozenset(
        ['sup', 'sub', 'b', 'i', 's', 'small', 'big', 'tt', 'u'])
    # Tags always forwarded to their start_<tag>/end_<tag> handler.
    _PLAIN_TAGS = frozenset(
        ['h1', 'h2', 'li', 'p', 'a', 'ul', 'ol', 'table', 'span'])
    # Tags forwarded only while a wikitable is being collected.
    _TABLE_TAGS = frozenset(['tr', 'td', 'th', 'caption'])

    def __init__(self, url, verbose=0):
        """Create the parser and write the PDF title page for ``url``.

        ``verbose`` is accepted for backward compatibility; it is not used.
        """
        HTMLParser.__init__(self)
        self.hyperlinks = []
        self.url = url
        self.language = detect_language(url)
        title = urllib.unquote(self.url.split("/")[-1])
        readable_url = urllib.unquote(self.url)
        self.pdf = PDFWriter(title + ".pdf", StandardPaper.A4)
        header = Header(text_align=pango.ALIGN_CENTER)  # TODO Alignment not working.
        header.set_text(readable_url)
        self.pdf.set_header(header)
        self.pdf.move_context(0, 500)
        h1 = Text(title, font="serif", font_size=32)
        h1.color = StandardColors.Blue
        self.pdf.add_text(h1)
        h2 = Text(readable_url, font="serif", font_size=16)
        h2.color = StandardColors.Blue
        self.pdf.add_text(h2)
        footer = Footer(text_align=pango.ALIGN_CENTER)
        footer.set_text("wiki2pdf")
        self.pdf.set_footer(footer)
        self.pdf.page_break()

    def reset(self):
        """Reset the HTML parser and every piece of per-document state."""
        HTMLParser.reset(self)
        self.images = []
        # "Currently inside <tag>" flags.
        self.h1 = False
        self.h2 = False
        self.li = False
        self.p = False
        self.a = False
        self.ul = False
        self.ol = False
        self.span = False
        self.table = False
        self.tr = False
        self.th = False
        self.td = False
        self.caption = False
        self.reference = False
        self.ref_counter = 0      # running number inside an <ol>
        self.column_counter = 0   # <th> cells seen in the current table
        self.current_counter = 0  # cells seen in the current row
        self.buffer = None        # None means "not collecting text"
        self.sup = False          # buffer contains inline markup tags

    @staticmethod
    def _has_class(attrs, name):
        """Return True when the ``class`` attribute in ``attrs`` lists ``name``."""
        for key, value in attrs:
            # The class attribute is a space separated list of names.
            if key == 'class' and value and name in value.split():
                return True
        return False

    def _inline_markup_allowed(self):
        """Return True when inline tags may be written into the buffer."""
        return (not self.reference and not self.table
                and self.buffer is not None)

    def handle_data(self, data):
        """Append non-blank character data to the buffer of an open element."""
        if data.strip() == "":
            return
        collecting = (self.p or self.h1 or self.h2 or self.a or self.span
                      or self.li or self.td or self.th or self.caption)
        if collecting and self.buffer is not None:
            self.buffer += data

    def handle_starttag(self, tag, attrs):
        """Dispatch an opening tag to its ``start_*`` handler."""
        if tag == 'img':
            if not self.table:
                self.start_img(attrs)
        elif tag in self._PLAIN_TAGS:
            getattr(self, 'start_' + tag)(attrs)
        elif tag in self._TABLE_TAGS:
            if self.table:
                getattr(self, 'start_' + tag)(attrs)
        elif tag in self._INLINE_TAGS:
            if self._inline_markup_allowed():
                self.buffer += "<" + tag + ">"
                self.sup = True

    def handle_endtag(self, tag):
        """Dispatch a closing tag to its ``end_*`` handler."""
        if tag == 'img':
            if not self.table:
                self.end_img()
        elif tag in self._PLAIN_TAGS:
            getattr(self, 'end_' + tag)()
        elif tag in self._TABLE_TAGS:
            if self.table:
                getattr(self, 'end_' + tag)()
        elif tag in self._INLINE_TAGS:
            # Same condition as for the opening tag, so that a stale ``sup``
            # flag cannot emit a closing tag that was never opened.
            if self.sup and self._inline_markup_allowed():
                self.buffer += "</" + str(tag) + ">"

    def start_img(self, attrs):
        """Remember the ``src`` of the image for ``end_img``."""
        self.images.extend(value for key, value in attrs if key == 'src')

    def end_img(self):
        """Download every pending image and add it to the PDF."""
        for wiki_image in self.images:
            outpath = self.grab_image(wiki_image, "/tmp")
            if not outpath or not os.path.isfile(outpath):
                continue  # download failed; there is nothing to embed
            image = Image()
            image.set_image_file(outpath)
            self.pdf.add_image(image)
        self.images = []

    def start_h1(self, attrs):
        self.h1 = True
        self.buffer = ""

    def end_h1(self):
        """Emit the collected level-1 heading, if any."""
        self.h1 = False
        if self.buffer and self.buffer.strip():
            h1 = Text(self.buffer, font="FreeSerif", font_size=16)
            h1.color = StandardColors.Blue
            self.pdf.add_text(h1)
        self.buffer = None

    def start_h2(self, attrs):
        self.h2 = True
        self.buffer = ""

    def end_h2(self):
        """Emit the collected level-2 heading, if any."""
        self.h2 = False
        if self.buffer and self.buffer.strip():
            h2 = Text(self.buffer, font="FreeSerif", font_size=14)
            h2.color = StandardColors.Blue
            self.pdf.add_text(h2)
        self.buffer = None

    def start_caption(self, attrs):
        self.caption = True
        self.buffer = ""

    def end_caption(self):
        """Emit the collected table caption, if any."""
        self.caption = False
        if self.buffer and self.buffer.strip():
            caption = Text(self.buffer, font="FreeSerif", font_size=14)
            caption.color = StandardColors.Blue
            self.pdf.add_text(caption)
        self.buffer = None

    def start_li(self, attrs):
        self.li = True
        self.buffer = ""

    def end_li(self):
        """Emit the list item: bulleted in <ul>, numbered in <ol>."""
        self.li = False
        if self.buffer and self.buffer.strip():
            if self.ul:
                markup = "• " + self.buffer
            elif self.ol:
                self.ref_counter += 1
                markup = (str(self.ref_counter) + ". "
                          + self.buffer.replace("↑", ""))
            else:
                markup = self.buffer
            li = Text(markup=markup, font="FreeSerif", font_size=10)
            self.pdf.add_text(li)
        self.buffer = None

    def start_a(self, attrs):
        self.a = True

    def end_a(self):
        self.a = False

    def start_table(self, attrs):
        """Begin collecting a table when it carries the ``wikitable`` class."""
        if not self._has_class(attrs, 'wikitable'):
            return
        self.table = True
        # Header cells are counted per table; without this reset the rows of
        # every table after the first would never match the column count.
        self.column_counter = 0
        self.wikitable = Table(border_width=1)
        self.wikitable.cell_padding = [2, 2, 2, 2]

    def end_table(self):
        if self.table:
            self.table = False
            self.pdf.add_table(self.wikitable)

    def start_tr(self, attrs):
        self.tr = True
        self.row = Row(height=25)
        self.current_counter = 0

    def end_tr(self):
        """Add the row only when it has as many cells as header cells seen."""
        self.tr = False
        if self.current_counter == self.column_counter:
            self.wikitable.add_row(self.row)

    def _add_cell(self):
        """Turn the buffer into a cell of the current row."""
        # The buffer is None when a nested element already consumed it.
        cell_content = Text(self.buffer or "", font_size=10)
        cell_content.color = Color(0.0, 0.0, 0.0, 1.0)
        cell = Cell(cell_content, font_size=8, width=100)
        self.row.add_cell(cell)
        self.current_counter += 1
        self.buffer = None

    def start_td(self, attrs):
        self.td = True
        self.buffer = ""

    def end_td(self):
        self.td = False
        self._add_cell()

    def start_th(self, attrs):
        self.th = True
        self.buffer = ""

    def end_th(self):
        self.th = False
        self.column_counter += 1
        self._add_cell()

    def start_ol(self, attrs):
        """Open an ordered list; note when it is the references list."""
        self.ol = True
        if self._has_class(attrs, 'references'):
            self.reference = True

    def end_ol(self):
        self.ol = False
        self.ref_counter = 0
        self.reference = False

    def start_ul(self, attrs):
        self.ul = True

    def end_ul(self):
        self.ul = False

    def start_span(self, attrs):
        self.span = True
        if self.buffer is None:
            self.buffer = ""

    def end_span(self):
        """Close a span, separating its text from what follows."""
        # An enclosing element may already have consumed the buffer.
        if self.buffer is not None:
            self.buffer += " "
        self.span = False

    def start_p(self, attrs):
        self.p = True
        self.buffer = ""

    def end_p(self):
        """Emit the collected paragraph, as markup when it has inline tags."""
        self.p = False
        if self.buffer is None:
            # Stray </p>, or a nested element already emitted the text.
            self.sup = False
            return
        if self.sup:
            para = Paragraph(markup=self.buffer, text=self.buffer,
                             font="FreeSerif", font_size=10)
            self.sup = False
        else:
            para = Paragraph(text=self.buffer, font="FreeSerif", font_size=10)
        para.set_justify(True)
        para.language = self.language or None
        para.set_hyphenate(True)
        self.pdf.add_paragraph(para)
        self.buffer = None

    def set_header(self, text):
        self.header = text

    def grab_image(self, imageurl, outputfolder):
        """Download ``imageurl`` into ``outputfolder``.

        Returns the local path built from the last URL segment.  The path is
        returned even when the download fails, so callers must check that
        the file exists before using it.
        """
        from contextlib import closing

        link = imageurl.strip()
        if link.startswith("//"):
            # Protocol-relative URL; urllib2 needs an explicit scheme.
            link = "https:" + link
        output_filename = os.path.join(outputfolder, link.split("/")[-1])
        print("GET IMAGE " + link + " ==> " + output_filename)
        if os.path.isfile(output_filename):
            print("File " + output_filename + " already exists")
            return output_filename
        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        try:
            with closing(opener.open(link)) as infile:
                page = infile.read()
            # Image data is binary; text mode would corrupt it on some
            # platforms.
            with open(output_filename, "wb") as image_file:
                image_file.write(page)
        except KeyboardInterrupt:
            sys.exit()
        except urllib2.URLError:
            # HTTPError is a subclass of URLError.
            print("Error: Could not download the image")
        return output_filename

    def parse(self):
        """Fetch ``self.url``, feed the cleaned page and flush the PDF."""
        from contextlib import closing

        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        with closing(opener.open(self.url)) as infile:
            page = infile.read()
        page = cleanup(page)
        self.feed(page)
        self.close()
        self.pdf.flush()
sys.path.append("../")
from pypdflib.writer import PDFWriter
from pypdflib.widgets import *
from pypdflib.styles import *
import pango

if __name__ == "__main__":
    pdf = PDFWriter("output.pdf", 595, 842)
    header = Header(text_align=pango.ALIGN_CENTER)  # TODO Alignment not working.
    header.set_text("test header")
    pdf.set_header(header)
    footer = Footer(text_align=pango.ALIGN_CENTER)
    footer.set_text("test footer")  # TODO Alignment not working.
    pdf.set_footer(footer)
    h1 = Text("Samples", font_size=16)
    pdf.add_h1(h1)
    h2 = Text("Malayalam", font_size=14)
    pdf.add_h2(h2)
    #image = Image(image_file="Four_Sons_of_Dasaratha.png")
    #pdf.add_image(image)
    # One paragraph per line of the sample file; the ``with`` block closes
    # the file once every line has been read.
    with open("malayalam.txt") as para_file_malayalam:
        for para_content in para_file_malayalam:
            para = Paragraph(text=para_content, font="Rachana")
            pdf.add_paragraph(para)
    h2 = Text("Hindi", font_size=14, font="Rachana")
    pdf.add_h2(h2)
    para_file_hindi = open("hindi.txt")
import sys
sys.path.append("../src/")  #not good!
from pypdflib.writer import PDFWriter
from pypdflib.widgets import *
from pypdflib.styles import *
import pango

if __name__ == "__main__":
    # Document with a centred header and footer.
    pdf = PDFWriter("tables.pdf", StandardPaper.A4)

    page_header = Header(text_align=pango.ALIGN_CENTER)  # TODO Alignment not working.
    page_header.set_text("test header")
    pdf.set_header(page_header)

    page_footer = Footer(text_align=pango.ALIGN_CENTER)
    page_footer.set_text("test footer")  # TODO Alignment not working.
    pdf.set_footer(page_footer)

    table = Table(border_width=1)
    table.cell_padding = [2, 2, 2, 2]

    # A single row of four black-text cells ...
    row = Row(height=100)
    for column in range(4):
        label = Text("SampleCell " + str(column), font_size=14)
        label.color = Color(0.0, 0.0, 0.0, 1.0)
        row.add_cell(Cell(label, font_size=8, width=100))

    # ... which is added to the table four times.
    for _ in range(4):
        table.add_row(row)

    pdf.add_table(table)
    pdf.flush()
class Wikiparser(HTMLParser):
    """Turn the Wikipedia page at ``url`` into a PDF using pypdflib widgets.

    State is a set of boolean "currently inside <tag>" flags plus one text
    ``buffer``.  Character data is appended to the buffer while a tracked
    element is open; the matching ``end_*`` handler converts the buffer into
    a PDF widget and sets the buffer back to ``None``.
    """

    # Inline formatting tags copied verbatim into the buffer as markup.
    _INLINE_TAGS = frozenset(
        ['sup', 'sub', 'b', 'i', 's', 'small', 'big', 'tt', 'u'])
    # Tags always forwarded to their start_<tag>/end_<tag> handler.
    _PLAIN_TAGS = frozenset(
        ['h1', 'h2', 'li', 'p', 'a', 'ul', 'ol', 'table', 'span'])
    # Tags forwarded only while a wikitable is being collected.
    _TABLE_TAGS = frozenset(['tr', 'td', 'th', 'caption'])

    def __init__(self, url, verbose=0):
        """Create the parser and write the PDF title page for ``url``.

        ``verbose`` is accepted for backward compatibility; it is not used.
        """
        HTMLParser.__init__(self)
        self.hyperlinks = []
        self.url = url
        self.language = detect_language(url)
        title = urllib.unquote(self.url.split("/")[-1])
        readable_url = urllib.unquote(self.url)
        self.pdf = PDFWriter(title + ".pdf", StandardPaper.A4)
        header = Header(text_align=pango.ALIGN_CENTER)  # TODO Alignment not working.
        header.set_text(readable_url)
        self.pdf.set_header(header)
        self.pdf.move_context(0, 500)
        h1 = Text(title, font="serif", font_size=32)
        h1.color = StandardColors.Blue
        self.pdf.add_text(h1)
        h2 = Text(readable_url, font="serif", font_size=16)
        h2.color = StandardColors.Blue
        self.pdf.add_text(h2)
        footer = Footer(text_align=pango.ALIGN_CENTER)
        footer.set_text("wiki2pdf")
        self.pdf.set_footer(footer)
        self.pdf.page_break()

    def reset(self):
        """Reset the HTML parser and every piece of per-document state."""
        HTMLParser.reset(self)
        self.images = []
        # "Currently inside <tag>" flags.
        self.h1 = False
        self.h2 = False
        self.li = False
        self.p = False
        self.a = False
        self.ul = False
        self.ol = False
        self.span = False
        self.table = False
        self.tr = False
        self.th = False
        self.td = False
        self.caption = False
        self.reference = False
        self.ref_counter = 0      # running number inside an <ol>
        self.column_counter = 0   # <th> cells seen in the current table
        self.current_counter = 0  # cells seen in the current row
        self.buffer = None        # None means "not collecting text"
        self.sup = False          # buffer contains inline markup tags

    @staticmethod
    def _has_class(attrs, name):
        """Return True when the ``class`` attribute in ``attrs`` lists ``name``."""
        for key, value in attrs:
            # The class attribute is a space separated list of names.
            if key == 'class' and value and name in value.split():
                return True
        return False

    def _inline_markup_allowed(self):
        """Return True when inline tags may be written into the buffer."""
        return (not self.reference and not self.table
                and self.buffer is not None)

    def handle_data(self, data):
        """Append non-blank character data to the buffer of an open element."""
        if data.strip() == "":
            return
        collecting = (self.p or self.h1 or self.h2 or self.a or self.span
                      or self.li or self.td or self.th or self.caption)
        if collecting and self.buffer is not None:
            self.buffer += data

    def handle_starttag(self, tag, attrs):
        """Dispatch an opening tag to its ``start_*`` handler."""
        if tag == 'img':
            if not self.table:
                self.start_img(attrs)
        elif tag in self._PLAIN_TAGS:
            getattr(self, 'start_' + tag)(attrs)
        elif tag in self._TABLE_TAGS:
            if self.table:
                getattr(self, 'start_' + tag)(attrs)
        elif tag in self._INLINE_TAGS:
            if self._inline_markup_allowed():
                self.buffer += "<" + tag + ">"
                self.sup = True

    def handle_endtag(self, tag):
        """Dispatch a closing tag to its ``end_*`` handler."""
        if tag == 'img':
            if not self.table:
                self.end_img()
        elif tag in self._PLAIN_TAGS:
            getattr(self, 'end_' + tag)()
        elif tag in self._TABLE_TAGS:
            if self.table:
                getattr(self, 'end_' + tag)()
        elif tag in self._INLINE_TAGS:
            # Same condition as for the opening tag, so that a stale ``sup``
            # flag cannot emit a closing tag that was never opened.
            if self.sup and self._inline_markup_allowed():
                self.buffer += "</" + str(tag) + ">"

    def start_img(self, attrs):
        """Remember the ``src`` of the image for ``end_img``."""
        self.images.extend(value for key, value in attrs if key == 'src')

    def end_img(self):
        """Download every pending image and add it to the PDF."""
        for wiki_image in self.images:
            outpath = self.grab_image(wiki_image, "/tmp")
            if not outpath or not os.path.isfile(outpath):
                continue  # download failed; there is nothing to embed
            image = Image()
            image.set_image_file(outpath)
            self.pdf.add_image(image)
        self.images = []

    def start_h1(self, attrs):
        self.h1 = True
        self.buffer = ""

    def end_h1(self):
        """Emit the collected level-1 heading, if any."""
        self.h1 = False
        if self.buffer and self.buffer.strip():
            h1 = Text(self.buffer, font="FreeSerif", font_size=16)
            h1.color = StandardColors.Blue
            self.pdf.add_text(h1)
        self.buffer = None

    def start_h2(self, attrs):
        self.h2 = True
        self.buffer = ""

    def end_h2(self):
        """Emit the collected level-2 heading, if any."""
        self.h2 = False
        if self.buffer and self.buffer.strip():
            h2 = Text(self.buffer, font="FreeSerif", font_size=14)
            h2.color = StandardColors.Blue
            self.pdf.add_text(h2)
        self.buffer = None

    def start_caption(self, attrs):
        self.caption = True
        self.buffer = ""

    def end_caption(self):
        """Emit the collected table caption, if any."""
        self.caption = False
        if self.buffer and self.buffer.strip():
            caption = Text(self.buffer, font="FreeSerif", font_size=14)
            caption.color = StandardColors.Blue
            self.pdf.add_text(caption)
        self.buffer = None

    def start_li(self, attrs):
        self.li = True
        self.buffer = ""

    def end_li(self):
        """Emit the list item: bulleted in <ul>, numbered in <ol>."""
        self.li = False
        if self.buffer and self.buffer.strip():
            if self.ul:
                markup = "• " + self.buffer
            elif self.ol:
                self.ref_counter += 1
                markup = (str(self.ref_counter) + ". "
                          + self.buffer.replace("↑", ""))
            else:
                markup = self.buffer
            li = Text(markup=markup, font="FreeSerif", font_size=10)
            self.pdf.add_text(li)
        self.buffer = None

    def start_a(self, attrs):
        self.a = True

    def end_a(self):
        self.a = False

    def start_table(self, attrs):
        """Begin collecting a table when it carries the ``wikitable`` class."""
        if not self._has_class(attrs, 'wikitable'):
            return
        self.table = True
        # Header cells are counted per table; without this reset the rows of
        # every table after the first would never match the column count.
        self.column_counter = 0
        self.wikitable = Table(border_width=1)
        self.wikitable.cell_padding = [2, 2, 2, 2]

    def end_table(self):
        if self.table:
            self.table = False
            self.pdf.add_table(self.wikitable)

    def start_tr(self, attrs):
        self.tr = True
        self.row = Row(height=25)
        self.current_counter = 0

    def end_tr(self):
        """Add the row only when it has as many cells as header cells seen."""
        self.tr = False
        if self.current_counter == self.column_counter:
            self.wikitable.add_row(self.row)

    def _add_cell(self):
        """Turn the buffer into a cell of the current row."""
        # The buffer is None when a nested element already consumed it.
        cell_content = Text(self.buffer or "", font_size=10)
        cell_content.color = Color(0.0, 0.0, 0.0, 1.0)
        cell = Cell(cell_content, font_size=8, width=100)
        self.row.add_cell(cell)
        self.current_counter += 1
        self.buffer = None

    def start_td(self, attrs):
        self.td = True
        self.buffer = ""

    def end_td(self):
        self.td = False
        self._add_cell()

    def start_th(self, attrs):
        self.th = True
        self.buffer = ""

    def end_th(self):
        self.th = False
        self.column_counter += 1
        self._add_cell()

    def start_ol(self, attrs):
        """Open an ordered list; note when it is the references list."""
        self.ol = True
        if self._has_class(attrs, 'references'):
            self.reference = True

    def end_ol(self):
        self.ol = False
        self.ref_counter = 0
        self.reference = False

    def start_ul(self, attrs):
        self.ul = True

    def end_ul(self):
        self.ul = False

    def start_span(self, attrs):
        self.span = True
        if self.buffer is None:
            self.buffer = ""

    def end_span(self):
        """Close a span, separating its text from what follows."""
        # An enclosing element may already have consumed the buffer.
        if self.buffer is not None:
            self.buffer += " "
        self.span = False

    def start_p(self, attrs):
        self.p = True
        self.buffer = ""

    def end_p(self):
        """Emit the collected paragraph, as markup when it has inline tags."""
        self.p = False
        if self.buffer is None:
            # Stray </p>, or a nested element already emitted the text.
            self.sup = False
            return
        if self.sup:
            para = Paragraph(markup=self.buffer, text=self.buffer,
                             font="FreeSerif", font_size=10)
            self.sup = False
        else:
            para = Paragraph(text=self.buffer, font="FreeSerif", font_size=10)
        para.set_justify(True)
        para.language = self.language or None
        para.set_hyphenate(True)
        self.pdf.add_paragraph(para)
        self.buffer = None

    def set_header(self, text):
        self.header = text

    def grab_image(self, imageurl, outputfolder):
        """Download ``imageurl`` into ``outputfolder``.

        Returns the local path built from the last URL segment.  The path is
        returned even when the download fails, so callers must check that
        the file exists before using it.
        """
        from contextlib import closing

        link = imageurl.strip()
        if link.startswith("//"):
            # Protocol-relative URL; urllib2 needs an explicit scheme.
            link = "https:" + link
        output_filename = os.path.join(outputfolder, link.split("/")[-1])
        print("GET IMAGE " + link + " ==> " + output_filename)
        if os.path.isfile(output_filename):
            print("File " + output_filename + " already exists")
            return output_filename
        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        try:
            with closing(opener.open(link)) as infile:
                page = infile.read()
            # Image data is binary; text mode would corrupt it on some
            # platforms.
            with open(output_filename, "wb") as image_file:
                image_file.write(page)
        except KeyboardInterrupt:
            sys.exit()
        except urllib2.URLError:
            # HTTPError is a subclass of URLError.
            print("Error: Could not download the image")
        return output_filename

    def parse(self):
        """Fetch ``self.url``, feed the cleaned page and flush the PDF."""
        from contextlib import closing

        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        with closing(opener.open(self.url)) as infile:
            page = infile.read()
        page = cleanup(page)
        self.feed(page)
        self.close()
        self.pdf.flush()
class Wikiparser(SGMLParser):
    """Render the page at ``url`` into a PDF through pypdflib.

    ``SGMLParser`` calls the ``start_<tag>``/``end_<tag>`` methods below.
    Text is collected in ``buffer`` while a tracked element is open and is
    turned into a PDF widget when the element closes.
    """

    def __init__(self, url, verbose=0):
        """Create the parser and write the PDF title page for ``url``.

        ``verbose`` is forwarded to ``SGMLParser.__init__``.
        """
        SGMLParser.__init__(self, verbose)
        self.hyperlinks = []
        self.url = url
        title = self.url.split("/")[-1]
        self.pdf = PDFWriter(title + ".pdf", 595, 842)
        header = Header(text_align=pango.ALIGN_CENTER)  # TODO Alignment not working.
        header.set_text(self.url)
        self.pdf.set_header(header)
        h1 = Text(title, font="Dyuthi", font_size=32)
        self.pdf.add_h1(h1)
        # Was "Rachan"; every other use in this class spells it "Rachana".
        h2 = Text(self.url, font="Rachana", font_size=16)
        self.pdf.add_h2(h2)
        footer = Footer(text_align=pango.ALIGN_CENTER)
        footer.set_text("wiki2pdf")
        self.pdf.set_footer(footer)
        self.pdf.page_break()

    def reset(self):
        """Reset the SGML parser and the per-document state."""
        SGMLParser.reset(self)
        self.images = []
        # "Currently inside <tag>" flags.
        self.h1 = False
        self.h2 = False
        self.li = False
        self.p = False
        self.a = False
        self.ul = False
        self.ol = False
        self.span = False
        self.buffer = None  # None means "not collecting text"

    def handle_data(self, data):
        """Append non-blank character data to the buffer of an open element."""
        if data.strip() == "":
            return
        # ``li`` is included so that plain list-item text is not dropped.
        collecting = (self.p or self.h1 or self.h2 or self.a or self.span
                      or self.li)
        if collecting and self.buffer is not None:
            self.buffer += data

    def start_img(self, attrs):
        """Remember the ``src`` of the image."""
        self.images.extend(value for key, value in attrs if key == 'src')

    def start_h1(self, attrs):
        self.h1 = True
        self.buffer = ""

    def end_h1(self):
        """Emit the collected level-1 heading, if any."""
        self.h1 = False
        if self.buffer and self.buffer.strip():
            h1 = Text(self.buffer, font="Dyuthi", font_size=16)
            self.pdf.add_h1(h1)
        self.buffer = None

    def start_h2(self, attrs):
        self.h2 = True
        self.buffer = ""

    def end_h2(self):
        """Emit the collected level-2 heading, if any."""
        self.h2 = False
        if self.buffer and self.buffer.strip():
            h2 = Text(self.buffer, font="Rachana", font_size=14)
            self.pdf.add_h2(h2)
        self.buffer = None

    def start_li(self, attrs):
        self.li = True
        self.buffer = ""

    def end_li(self):
        """Emit the list item, bulleted when inside a <ul>."""
        self.li = False
        if self.buffer and self.buffer.strip():
            content = "• " + self.buffer if self.ul else self.buffer
            li = Text(content, font_size=10)
            self.pdf.add_li(li)
        self.buffer = None

    def start_a(self, attrs):
        self.a = True

    def end_a(self):
        self.a = False

    def start_ol(self, attrs):
        self.ol = True

    def end_ol(self):
        self.ol = False

    def start_ul(self, attrs):
        self.ul = True

    def end_ul(self):
        self.ul = False

    def start_span(self, attrs):
        self.span = True
        if self.buffer is None:
            self.buffer = ""

    def end_span(self):
        """Close a span, separating its text from what follows."""
        # An enclosing element may already have consumed the buffer.
        if self.buffer is not None:
            self.buffer += " "
        self.span = False

    def start_p(self, attrs):
        self.p = True
        self.buffer = ""

    def end_p(self):
        """Emit the collected paragraph."""
        self.p = False
        if self.buffer is None:
            return  # stray </p>, or a nested element consumed the text
        para = Paragraph(text=self.buffer, font="Rachana", font_size=10)
        para.set_justify(True)
        para.language = "ml_IN"
        para.set_hyphenate(True)
        self.pdf.add_paragraph(para)
        self.buffer = None

    def set_header(self, text):
        self.header = text

    def parse(self):
        """Fetch ``self.url`` and feed the cleaned page to the parser."""
        from contextlib import closing

        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        with closing(opener.open(self.url)) as infile:
            page = infile.read()
        page = cleanup(page)
        self.feed(page)
        self.close()
class Wikiparser2(HTMLParser):
    """Render every URL listed in an input file into one PDF.

    The input file holds one URL per line.  The first URL supplies the
    title page and the language; ``parse`` then renders each URL in turn,
    separated by page breaks.
    """

    # Inline formatting tags copied verbatim into the buffer as markup.
    _INLINE_TAGS = frozenset(
        ["sup", "sub", "b", "i", "s", "small", "big", "tt", "u"])
    # Tags forwarded to their start_<tag>/end_<tag> handler.
    _PLAIN_TAGS = frozenset(
        ["img", "h1", "h2", "li", "p", "a", "ul", "ol", "span"])

    def __init__(self, inputfile, filename, font="serif", verbose=0):
        """Create the parser and write the PDF title page.

        ``inputfile`` is resolved relative to this module's directory and
        ``filename`` is created inside its ``tmp`` sub-directory.
        ``verbose`` is accepted for backward compatibility; it is not used.
        """
        HTMLParser.__init__(self)
        self.hyperlinks = []
        self.inputfile = os.path.join(os.path.dirname(__file__), inputfile)
        with open(self.inputfile) as url_file:
            # Drop the line terminator so it does not leak into the title.
            url = url_file.readline().strip()
        self.font = font
        self.language = detect_language(url)
        tmp_folder = os.path.join(os.path.dirname(__file__), "tmp")
        self.pdf = PDFWriter(os.path.join(tmp_folder, filename),
                             StandardPaper.A4)
        header = Header(text_align=pango.ALIGN_CENTER)  # TODO Alignment not working.
        header.set_text(urllib.unquote(url))
        self.pdf.set_header(header)
        self.pdf.move_context(0, 500)
        h1 = Text(urllib.unquote(url.split("/")[-1]), font=self.font,
                  font_size=32)
        h1.color = StandardColors.Blue
        self.pdf.add_text(h1)
        h2 = Text(urllib.unquote(url), font=self.font, font_size=16)
        h2.color = StandardColors.Blue
        self.pdf.add_text(h2)
        footer = Footer(text_align=pango.ALIGN_CENTER)
        footer.set_text("wiki2book")
        self.pdf.set_footer(footer)
        self.pdf.page_break()

    def reset(self):
        """Reset the HTML parser and every piece of per-document state."""
        HTMLParser.reset(self)
        self.images = []
        # "Currently inside <tag>" flags.
        self.h1 = False
        self.h2 = False
        self.li = False
        self.p = False
        self.a = False
        self.ul = False
        self.ol = False
        self.span = False
        self.reference = False
        self.ref_counter = 0  # running number inside an <ol>
        self.buffer = None    # None means "not collecting text"
        self.sup = False      # buffer contains inline markup tags

    @staticmethod
    def _has_class(attrs, name):
        """Return True when the ``class`` attribute in ``attrs`` lists ``name``."""
        for key, value in attrs:
            # The class attribute is a space separated list of names.
            if key == "class" and value and name in value.split():
                return True
        return False

    def _inline_markup_allowed(self):
        """Return True when inline tags may be written into the buffer."""
        return not self.reference and self.buffer is not None

    def handle_data(self, data):
        """Append non-blank character data to the buffer of an open element."""
        if data.strip() == "":
            return
        collecting = (self.p or self.h1 or self.h2 or self.a or self.span
                      or self.li)
        if collecting and self.buffer is not None:
            self.buffer += data

    def handle_starttag(self, tag, attrs):
        """Dispatch an opening tag to its ``start_*`` handler."""
        if tag in self._PLAIN_TAGS:
            getattr(self, "start_" + tag)(attrs)
        elif tag in self._INLINE_TAGS:
            if self._inline_markup_allowed():
                self.buffer += "<" + tag + ">"
                self.sup = True

    def handle_endtag(self, tag):
        """Dispatch a closing tag to its ``end_*`` handler."""
        if tag in self._PLAIN_TAGS:
            getattr(self, "end_" + tag)()
        elif tag in self._INLINE_TAGS:
            # Same condition as for the opening tag, so that a stale ``sup``
            # flag cannot emit a closing tag that was never opened.
            if self.sup and self._inline_markup_allowed():
                self.buffer += "</" + str(tag) + ">"

    def start_img(self, attrs):
        """Remember the ``src`` of the image for ``end_img``."""
        self.images.extend(value for key, value in attrs if key == "src")

    def end_img(self):
        """Download every pending image and add it to the PDF."""
        for wiki_image in self.images:
            outpath = self.grab_image(wiki_image, "/tmp")
            if not outpath or not os.path.isfile(outpath):
                continue  # download failed; there is nothing to embed
            image = Image()
            image.set_image_file(outpath)
            self.pdf.add_image(image)
        self.images = []

    def start_h1(self, attrs):
        self.h1 = True
        self.buffer = ""

    def end_h1(self):
        """Emit the collected level-1 heading, if any."""
        self.h1 = False
        if self.buffer and self.buffer.strip():
            h1 = Text(self.buffer, font=self.font, font_size=16)
            h1.color = StandardColors.Blue
            self.pdf.add_text(h1)
        self.buffer = None

    def start_h2(self, attrs):
        self.h2 = True
        self.buffer = ""

    def end_h2(self):
        """Emit the collected level-2 heading, if any."""
        self.h2 = False
        if self.buffer and self.buffer.strip():
            h2 = Text(self.buffer, font=self.font, font_size=14)
            h2.color = StandardColors.Blue
            self.pdf.add_text(h2)
        self.buffer = None

    def start_li(self, attrs):
        self.li = True
        self.buffer = ""

    def end_li(self):
        """Emit the list item: bulleted in <ul>, numbered in <ol>."""
        self.li = False
        if self.buffer and self.buffer.strip():
            if self.ul:
                markup = "• " + self.buffer
            elif self.ol:
                self.ref_counter += 1
                markup = (str(self.ref_counter) + ". "
                          + self.buffer.replace("↑", ""))
            else:
                markup = self.buffer
            li = Text(markup=markup, font=self.font, font_size=10)
            self.pdf.add_text(li)
        self.buffer = None

    def start_a(self, attrs):
        self.a = True

    def end_a(self):
        self.a = False

    def start_ol(self, attrs):
        """Open an ordered list; note when it is the references list."""
        self.ol = True
        if self._has_class(attrs, "references"):
            self.reference = True

    def end_ol(self):
        self.ol = False
        self.ref_counter = 0
        self.reference = False

    def start_ul(self, attrs):
        self.ul = True

    def end_ul(self):
        self.ul = False

    def start_span(self, attrs):
        self.span = True
        if self.buffer is None:
            self.buffer = ""

    def end_span(self):
        """Close a span; the buffer is left untouched."""
        self.span = False

    def start_p(self, attrs):
        self.p = True
        self.buffer = ""

    def end_p(self):
        """Emit the collected paragraph, as markup when it has inline tags."""
        self.p = False
        if self.buffer is None:
            # Stray </p>, or a nested element already emitted the text.
            self.sup = False
            return
        if self.sup:
            para = Paragraph(markup=self.buffer, text=self.buffer,
                             font=self.font, font_size=10)
            self.sup = False
        else:
            para = Paragraph(text=self.buffer, font=self.font, font_size=10)
        para.set_justify(True)
        para.language = self.language or None
        para.set_hyphenate(True)
        self.pdf.add_paragraph(para)
        self.buffer = None

    def set_header(self, text):
        self.header = text

    def grab_image(self, imageurl, outputfolder):
        """Download ``imageurl`` into ``outputfolder``.

        Returns the local path built from the last URL segment.  The path is
        returned even when the download fails, so callers must check that
        the file exists before using it.
        """
        from contextlib import closing

        link = imageurl.strip()
        if link.startswith("//"):
            # Only protocol-relative URLs need a scheme; prefixing an
            # absolute URL would make it invalid.
            link = "https:" + link
        output_filename = os.path.join(outputfolder, link.split("/")[-1])
        print("GET IMAGE " + link + " ==> " + output_filename)
        if os.path.isfile(output_filename):
            print("File " + output_filename + " already exists")
            return output_filename
        opener = urllib2.build_opener()
        opener.addheaders = [("User-agent", "Mozilla/5.0")]
        try:
            with closing(opener.open(link)) as infile:
                page = infile.read()
            # Image data is binary; text mode would corrupt it on some
            # platforms.
            with open(output_filename, "wb") as image_file:
                image_file.write(page)
        except KeyboardInterrupt:
            sys.exit()
        except urllib2.URLError:
            # HTTPError is a subclass of URLError.
            print("Error: Could not download the image")
        return output_filename

    def parse(self):
        """Render every URL of the input file and flush the PDF."""
        from contextlib import closing

        opener = urllib2.build_opener()
        opener.addheaders = [("User-agent", "Mozilla/5.0")]
        with open(self.inputfile) as url_file:
            for line in url_file:
                url = line.strip()
                if not url:
                    continue  # a blank line is not a URL
                with closing(opener.open(url)) as infile:
                    page = infile.read()
                page = cleanup(page)
                self.feed(page)
                self.pdf.page_break()
        self.close()
        self.pdf.flush()
class Wikiparser(SGMLParser):
    """Render parsed HTML into ``output.pdf`` through pypdflib.

    ``SGMLParser`` calls the ``start_<tag>``/``end_<tag>`` methods below.
    Text is collected in ``buffer`` while a tracked element is open and is
    turned into a PDF widget when the element closes.
    """

    def reset(self):
        """Reset the SGML parser, create the PDF writer and clear all state."""
        SGMLParser.reset(self)
        self.images = []
        # TODO make the output file configurable- take it from command line
        self.pdf = PDFWriter("output.pdf", 595, 842)
        header = Header(text_align=pango.ALIGN_CENTER)  # TODO Alignment not working.
        header.set_text("A wikipedia article")
        self.pdf.set_header(header)
        footer = Footer(text_align=pango.ALIGN_CENTER)
        footer.set_text("wiki2pdf")  # TODO Alignment not working.
        self.pdf.set_footer(footer)
        # "Currently inside <tag>" flags.
        self.h1 = False
        self.h2 = False
        self.li = False
        self.p = False
        self.a = False
        self.ul = False
        self.ol = False
        self.span = False
        self.buffer = None  # None means "not collecting text"

    def handle_data(self, data):
        """Append non-blank character data to the buffer of an open element."""
        if data.strip() == "":
            return
        # ``li`` is included so that plain list-item text is not dropped.
        collecting = (self.p or self.h1 or self.h2 or self.a or self.span
                      or self.li)
        if collecting and self.buffer is not None:
            self.buffer += data

    def start_img(self, attrs):
        """Remember the ``src`` of the image."""
        self.images.extend(value for key, value in attrs if key == 'src')

    def start_h1(self, attrs):
        self.h1 = True
        self.buffer = ""

    def end_h1(self):
        """Emit the collected level-1 heading, if any."""
        self.h1 = False
        if self.buffer and self.buffer.strip():
            h1 = Text(self.buffer, font_size=16)
            self.pdf.add_h1(h1)
        self.buffer = None

    def start_h2(self, attrs):
        self.h2 = True
        self.buffer = ""

    def end_h2(self):
        """Emit the collected level-2 heading, if any."""
        self.h2 = False
        if self.buffer and self.buffer.strip():
            h2 = Text(self.buffer, font_size=14)
            self.pdf.add_h2(h2)
        self.buffer = None

    def start_li(self, attrs):
        self.li = True
        self.buffer = ""

    def end_li(self):
        """Emit the list item, bulleted when inside a <ul>."""
        self.li = False
        if self.buffer and self.buffer.strip():
            content = "• " + self.buffer if self.ul else self.buffer
            li = Text(content, font_size=10)
            self.pdf.add_li(li)
        self.buffer = None

    def start_a(self, attrs):
        self.a = True

    def end_a(self):
        self.a = False

    def start_ol(self, attrs):
        self.ol = True

    def end_ol(self):
        self.ol = False

    def start_ul(self, attrs):
        self.ul = True

    def end_ul(self):
        self.ul = False

    def start_span(self, attrs):
        self.span = True
        if self.buffer is None:
            self.buffer = ""

    def end_span(self):
        """Close a span, separating its text from what follows."""
        # An enclosing element may already have consumed the buffer.
        if self.buffer is not None:
            self.buffer += " "
        self.span = False

    def start_p(self, attrs):
        self.p = True
        self.buffer = ""

    def end_p(self):
        """Emit the collected paragraph."""
        self.p = False
        if self.buffer is None:
            return  # stray </p>, or a nested element consumed the text
        para = Paragraph(text=self.buffer, font="Rachana", font_size=10)
        para.set_justify(True)
        self.pdf.add_paragraph(para)
        self.buffer = None
class HTMLParser(SGMLParser):
    """Render a text file, converted to HTML by docutils, into a PDF.

    ``SGMLParser`` calls the ``start_<tag>``/``end_<tag>`` methods below.
    Text is collected in ``buffer`` while a tracked element is open and is
    turned into a PDF widget when the element closes.
    """

    def __init__(self, verbose=0):
        """Create the parser; ``verbose`` is forwarded to ``SGMLParser``."""
        SGMLParser.__init__(self, verbose)
        self.hyperlinks = []
        self.pdf = None  # created by parse()

    def reset(self):
        """Reset the SGML parser and the per-document state."""
        SGMLParser.reset(self)
        self.images = []
        # "Currently inside <tag>" flags.
        self.h1 = False
        self.h2 = False
        self.li = False
        self.p = False
        self.a = False
        self.ul = False
        self.ol = False
        self.span = False
        self.buffer = None  # None means "not collecting text"

    def handle_data(self, data):
        """Append non-blank character data to the buffer of an open element."""
        if data.strip() == "":
            return
        # ``li`` is included so that plain list-item text is not dropped.
        collecting = (self.p or self.h1 or self.h2 or self.a or self.span
                      or self.li)
        if collecting and self.buffer is not None:
            self.buffer += data

    def start_img(self, attrs):
        """Remember the ``src`` of the image."""
        self.images.extend(value for key, value in attrs if key == 'src')

    def start_h1(self, attrs):
        self.h1 = True
        self.buffer = ""

    def end_h1(self):
        """Emit the collected level-1 heading, if any."""
        self.h1 = False
        if self.buffer and self.buffer.strip():
            h1 = Text(self.buffer, font="Serif", font_size=16)
            self.pdf.add_text(h1)
        self.buffer = None

    def start_h2(self, attrs):
        self.h2 = True
        self.buffer = ""

    def end_h2(self):
        """Emit the collected level-2 heading, if any."""
        self.h2 = False
        if self.buffer and self.buffer.strip():
            h2 = Text(self.buffer, font="Serif", font_size=14)
            self.pdf.add_text(h2)
        self.buffer = None

    def start_li(self, attrs):
        self.li = True
        self.buffer = ""

    def end_li(self):
        """Emit the list item, bulleted when inside a <ul>."""
        self.li = False
        if self.buffer and self.buffer.strip():
            content = "• " + self.buffer if self.ul else self.buffer
            li = Text(content, font_size=10)
            self.pdf.add_text(li)
        self.buffer = None

    def start_a(self, attrs):
        self.a = True

    def end_a(self):
        self.a = False

    def start_ol(self, attrs):
        self.ol = True

    def end_ol(self):
        self.ol = False

    def start_ul(self, attrs):
        self.ul = True

    def end_ul(self):
        self.ul = False

    def start_span(self, attrs):
        self.span = True
        if self.buffer is None:
            self.buffer = ""

    def end_span(self):
        """Close a span, separating its text from what follows."""
        # An enclosing element may already have consumed the buffer.
        if self.buffer is not None:
            self.buffer += " "
        self.span = False

    def start_p(self, attrs):
        self.p = True
        self.buffer = ""

    def end_p(self):
        """Emit the collected paragraph."""
        self.p = False
        if self.buffer is None:
            return  # stray </p>, or a nested element consumed the text
        para = Paragraph(text=self.buffer, font="Serif", font_size=10)
        para.set_justify(True)
        para.set_hyphenate(False)
        self.pdf.add_paragraph(para)
        self.buffer = None

    def parse(self, filename, outputfile):
        """Convert the UTF-8 text file ``filename`` into the PDF ``outputfile``.

        Returns ``''`` without writing anything when ``filename`` cannot be
        read or when the conversion yields no ``html_body`` part.
        """
        try:
            with codecs.open(filename, 'r', 'utf-8') as source:
                text = source.read()
        except IOError:
            # given filename could not be found
            return ''
        parts = publish_parts(text, writer=Writer(),
                              settings_overrides=SETTINGS)
        if 'html_body' not in parts:
            # Nothing to render; avoids feeding an undefined name below.
            return ''
        html = parts['html_body']
        self.pdf = PDFWriter(outputfile, StandardPaper.A4)
        footer = Footer()
        header = Header()
        self.pdf.set_footer(footer)
        self.pdf.set_header(header)
        self.feed(html)
        self.close()
        self.pdf.flush()
class Wikiparser(SGMLParser):
    """Download a wiki page and render it into a PDF document.

    SGMLParser dispatches every ``<tag>`` / ``</tag>`` to the matching
    ``start_tag`` / ``end_tag`` method defined below.  Text is collected in
    ``self.buffer`` while one of the tracked tags is open and is written to
    ``self.pdf`` when the tag closes.  ``self.buffer`` is ``None`` whenever
    no element is collecting text.
    """

    def __init__(self, url, filename, verbose=0):
        """Create the PDF writer and emit the title page.

        url      -- address of the wiki page to convert
        filename -- name of the PDF, created in the 'tmp' folder that sits
                    next to this module
        verbose  -- passed on to the superclass
        """
        SGMLParser.__init__(self, verbose)
        self.hyperlinks = []
        self.url = url
        self.language = detect_language(url)
        tmp_folder = os.path.join(os.path.dirname(__file__), "tmp")
        self.pdf = PDFWriter(os.path.join(tmp_folder, filename),
                             StandardPaper.A4)
        # TODO Alignment not working.
        header = Header(text_align=pango.ALIGN_CENTER)
        header.set_text(urllib.unquote(self.url))
        self.pdf.set_header(header)
        self.pdf.move_context(0, 500)
        # Title page: last path component of the URL, then the full URL.
        h1 = Text(urllib.unquote(self.url.split("/")[-1]),
                  font="FreeSerif", font_size=32)
        h1.color = StandardColors.Blue
        self.pdf.add_text(h1)
        h2 = Text(urllib.unquote(self.url), font="FreeSerif", font_size=16)
        h2.color = StandardColors.Blue
        self.pdf.add_text(h2)
        footer = Footer(text_align=pango.ALIGN_CENTER)
        footer.set_text("wiki2pdf")
        self.pdf.set_footer(footer)
        self.pdf.page_break()

    def reset(self):
        """Reset the SGML parser and clear all per-document state."""
        SGMLParser.reset(self)
        self.images = []
        # TODO Alignment not working.
        # One flag per tracked tag: True while that tag is open.
        self.h1 = False
        self.h2 = False
        self.li = False
        self.p = False
        self.a = False
        self.ul = False
        self.ol = False
        self.span = False
        self.buffer = None

    def handle_data(self, data):
        """Append character data to the buffer while a tracked tag is open.

        Whitespace-only data is ignored, and so is any data arriving while
        no buffer is active.
        """
        if data.strip() == "":
            return
        if self.buffer is None:
            return
        # 'li' must be part of this test, otherwise the text of a plain
        # <li>...</li> never reaches the buffer and end_li() drops the item.
        if (self.p or self.h1 or self.h2 or self.a or self.span
                or self.li):
            self.buffer += data

    def start_img(self, attrs):
        """Remember the 'src' attribute value(s) of an image tag."""
        src = [value for key, value in attrs if key == 'src']
        if src:
            self.images.extend(src)

    def end_img(self):
        """Fetch every pending image and add it to the PDF."""
        for wiki_image in self.images:
            outpath = self.grab_image(wiki_image, "/tmp")
            # grab_image() is best effort: after a failed download there is
            # no file on disk, so there is nothing to embed.
            if not outpath or not os.path.isfile(outpath):
                continue
            image = Image()
            image.set_image_file(outpath)
            self.pdf.add_image(image)
        self.images = []

    def start_h1(self, attrs):
        """Open a level-1 heading and start a fresh buffer."""
        self.h1 = True
        self.buffer = ""

    def end_h1(self):
        """Write the buffered level-1 heading to the PDF in blue."""
        self.h1 = False
        h1 = Text(self.buffer, font="FreeSerif", font_size=16)
        h1.color = StandardColors.Blue
        self.pdf.add_text(h1)
        self.buffer = None

    def start_h2(self, attrs):
        """Open a level-2 heading and start a fresh buffer."""
        self.h2 = True
        self.buffer = ""

    def end_h2(self):
        """Write the buffered level-2 heading in blue, skipping blank ones."""
        self.h2 = False
        if self.buffer and self.buffer.strip():
            h2 = Text(self.buffer, font="FreeSerif", font_size=14)
            h2.color = StandardColors.Blue
            self.pdf.add_text(h2)
        self.buffer = None

    def start_li(self, attrs):
        """Open a list item and start a fresh buffer."""
        self.li = True
        self.buffer = ""

    def end_li(self):
        """Write the buffered list item, bulleted when inside a <ul>."""
        self.li = False
        if self.buffer and self.buffer.strip():
            if self.ul:
                li = Text("• " + self.buffer, font_size=10)
            else:
                li = Text(self.buffer, font_size=10)
            self.pdf.add_text(li)
        self.buffer = None

    def start_a(self, attrs):
        """Mark that an anchor is open."""
        self.a = True

    def end_a(self):
        """Mark that the anchor is closed."""
        self.a = False

    def start_ol(self, attrs):
        """Mark that an ordered list is open."""
        self.ol = True

    def end_ol(self):
        """Mark that the ordered list is closed."""
        self.ol = False

    def start_ul(self, attrs):
        """Mark that an unordered list is open."""
        self.ul = True

    def end_ul(self):
        """Mark that the unordered list is closed."""
        self.ul = False

    def start_span(self, attrs):
        """Open a span, creating a buffer only if none is active."""
        self.span = True
        if self.buffer is None:
            self.buffer = ""

    def end_span(self):
        """Close a span, separating its text from what follows by a space."""
        # A block element closed inside the span has already reset the
        # buffer to None; appending to it would raise TypeError.
        if self.buffer is not None:
            self.buffer += " "
        self.span = False

    def start_p(self, attrs):
        """Open a paragraph and start a fresh buffer."""
        self.p = True
        self.buffer = ""

    def end_p(self):
        """Write the buffered paragraph, justified and hyphenated."""
        self.p = False
        para = Paragraph(text=self.buffer, font="FreeSerif", font_size=10)
        para.set_justify(True)
        # Use the language detected from the URL, or None if there is none.
        if self.language:
            para.language = self.language
        else:
            para.language = None
        para.set_hyphenate(True)
        self.pdf.add_paragraph(para)
        self.buffer = None

    def set_header(self, text):
        """Store 'text' in self.header."""
        self.header = text

    def grab_image(self, imageurl, outputfolder):
        """
        Get the image from wiki

        The image is stored in 'outputfolder' under the last path component
        of 'imageurl'.  A file that already exists there is reused without
        downloading again.

        Returns the local file path.  The path is returned even when the
        download failed, in which case no file exists at that location.
        """
        output_filename = None
        try:
            link = imageurl.strip()
            filename = link.split("/")[-1]
            output_filename = os.path.join(outputfolder, filename)
            print("GET IMAGE " + link + " ==> " + output_filename)
            if os.path.isfile(output_filename):
                print("File " + output_filename + " already exists")
                return output_filename
            opener = urllib2.build_opener()
            opener.addheaders = [('User-agent', 'Mozilla/5.0')]
            infile = opener.open(link)
            try:
                page = infile.read()
            finally:
                infile.close()
            # Image data is binary, so the file must not be opened in text
            # mode; 'with' closes it even when the write fails.
            with open(output_filename, "wb") as f:
                f.write(page)
        except KeyboardInterrupt:
            sys.exit()
        except urllib2.URLError:
            # URLError is the base class of HTTPError, so unreachable hosts
            # are handled the same way as HTTP error responses.
            print("Error: Cound not download the image")
        return output_filename

    def parse(self):
        """Download the wiki page for self.url and render it into the PDF.

        The page is requested as <scheme>://<host>/wiki/<last URL component>,
        passed through cleanup() and fed to the parser; finally the PDF is
        flushed.
        """
        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        o = urlparse.urlparse(self.url)
        base = o.scheme + "://" + o.netloc
        filename = self.url.split("/")[-1]
        quotedfilename = urllib.quote(filename.encode('utf-8'))
        link = base + "/wiki/" + quotedfilename
        # Call form of print, matching grab_image().
        print("Get : " + link)
        infile = opener.open(link)
        try:
            page = infile.read()
        finally:
            infile.close()
        page = cleanup(page)
        self.feed(page)
        self.close()
        self.pdf.flush()