def parse(self, raw):
    """Parse the raw text of one ``<column>`` element into ``self.members``.

    The ``b``, ``t``, ``l`` and ``r`` attribute values are taken from the
    first ``b="<digits>"`` (etc.) occurrence anywhere in ``raw`` and stored,
    as digit strings, under ``_bottom``, ``_top``, ``_left`` and ``_right``.

    The opening ``<column ...>`` line and the trailing ``</column>`` are then
    stripped and the leading child elements are consumed one after another:

    * ``<para>``, ``<table>`` and ``<frame>`` children are handed to
      ``rtfPara.rtfPara``, ``rtfTable.rtfTable`` and ``rtfFrame.rtfFrame``
      via ``set_raw``; each object is collected in ``_objs`` and its
      ``get_content()`` plus a newline is appended to ``_content``.
    * ``<dd>`` children are skipped after a notice is written to stderr.
    * ``<image>`` children are skipped silently.

    Parsing stops at the first text that starts with none of these tags.

    Args:
        raw: Text of the column element, starting with its ``<column`` tag.

    Raises:
        AttributeError: If one of the four attributes is not found in
            ``raw`` (``re.search`` returns ``None``).
        ValueError: If a child element has no matching closing tag.
    """
    # Column geometry, kept as the digit strings found in the markup.
    bottom = re.search(r"b=\"(\d+)\"", raw).group(1)
    top = re.search(r"t=\"(\d+)\"", raw).group(1)
    left = re.search(r"l=\"(\d+)\"", raw).group(1)
    right = re.search(r"r=\"(\d+)\"", raw).group(1)

    # Drop the wrapper so the text starts at the first child element.
    rest = re.sub(r"^<column[^>]*>\n", "", raw)
    rest = re.sub(r"</column>(\n)?$", "", rest)

    # Make the sibling parser modules importable, but register the directory
    # only once so repeated calls do not keep growing sys.path.
    if "./LSRRTF/" not in sys.path:
        sys.path.append("./LSRRTF/")

    children = []
    pieces = []

    while True:
        # Child parsers are imported lazily: only the modules for element
        # kinds that actually occur in this column are loaded.
        if rest.startswith("<para"):
            import rtfPara

            child, closing_tag = rtfPara.rtfPara(), "</para>"
        elif rest.startswith("<dd"):
            # Not implemented yet: report it and skip the element.
            sys.stderr.write("Para::dd::Neimplementovano\n")
            child, closing_tag = None, "</dd>"
        elif rest.startswith("<table"):
            import rtfTable

            child, closing_tag = rtfTable.rtfTable(), "</table>"
        elif rest.startswith("<image"):
            # Images carry no text content here; skip them silently.
            child, closing_tag = None, "</image>"
        elif rest.startswith("<frame"):
            import rtfFrame

            child, closing_tag = rtfFrame.rtfFrame(), "</frame>"
        else:
            break

        end = rest.index(closing_tag) + len(closing_tag)
        element_raw = rest[:end]
        # "+ 1" also drops the single character after the closing tag
        # (presumably the newline separating elements).
        rest = rest[end + 1:]

        if child is None:
            continue

        child.set_raw(element_raw)
        children.append(child)
        pieces.append(child.get_content() + "\n")

    self.members["_bottom"] = bottom
    self.members["_top"] = top
    self.members["_left"] = left
    self.members["_right"] = right
    self.members["_objs"] = children
    self.members["_content"] = "".join(pieces)
def parse(self, raw):
    """Parse the raw text of a page into the module-level parse results.

    The ``<page>``/``<body>`` wrapper tags are removed from ``raw`` and the
    remaining text is consumed from the front:

    * a ``<section ...>`` opening line and the first ``</section>`` line are
      stripped, then the ``<column>``, ``<dd>`` and ``<frame>`` elements that
      follow are parsed;
    * ``<column>``, ``<dd>`` and ``<frame>`` elements that appear outside a
      section are parsed the same way.

    Each element is handed to ``rtfCol.rtfCol``, ``rtfDd.rtfDd`` or
    ``rtfFrame.rtfFrame`` via ``set_raw``. The resulting objects are
    collected in the global ``tmp_objs`` and each ``get_content()`` plus a
    newline is appended to the global ``tmp_content``. Parsing stops as soon
    as the text starts with none of the known tags.

    Args:
        raw: Text of the page, including its ``<page>``/``<body>`` wrapper.

    Raises:
        ValueError: If an element has no matching closing tag.
    """
    global tmp_content
    global tmp_objs

    # Start from a blank page: no content and no child objects yet.
    tmp_content = ""
    tmp_objs = []

    tmpRaw = raw
    for wrapper in (r"<page>\n", r"</page>(\n)?", r"<body>\n", r"</body>(\n)?"):
        tmpRaw = re.sub(wrapper, "", tmpRaw)

    # Make the sibling parser modules importable, but register the directory
    # only once so repeated calls do not keep growing sys.path.
    if "./LSRRTF/" not in sys.path:
        sys.path.append("./LSRRTF/")

    def consume_children(text):
        """Parse leading <column>/<dd>/<frame> elements; return what is left."""
        global tmp_content
        while True:
            # Child parsers are imported lazily, only for kinds that occur.
            if text.startswith("<column"):
                import rtfCol

                child, closing_tag = rtfCol.rtfCol(), "</column>"
            elif text.startswith("<dd"):
                import rtfDd

                child, closing_tag = rtfDd.rtfDd(), "</dd>"
            elif text.startswith("<frame"):
                import rtfFrame

                child, closing_tag = rtfFrame.rtfFrame(), "</frame>"
            else:
                return text

            end = text.index(closing_tag) + len(closing_tag)
            # The element's own raw text goes to the object created for it.
            child.set_raw(text[:end])
            # "+ 1" also drops the single character after the closing tag
            # (presumably the newline separating elements).
            text = text[end + 1:]
            tmp_objs.append(child)
            tmp_content += child.get_content() + "\n"

    while True:
        before = tmpRaw
        if tmpRaw.startswith("<section"):
            section = re.sub(r"^<section[^>]*>\n", "", tmpRaw)
            section = re.sub(r"</section>\n", "", section, count=1)
            tmpRaw = consume_children(section)
        else:
            tmpRaw = consume_children(tmpRaw)
        # Nothing was stripped or parsed in this pass: either the input is
        # exhausted or it starts with something unknown, so stop here
        # instead of looping forever.
        if tmpRaw == before:
            break