示例#1
0
    def parse(self, raw):
        """Parse the raw text of a ``<column>`` node into ``self.members``.

        The ``b``, ``t``, ``l`` and ``r`` attributes are read as digit strings
        (the first ``name="<digits>"`` occurrence anywhere in ``raw``).  The
        ``<column ...>`` / ``</column>`` wrapper is then removed and the
        remaining text is consumed element by element from the front:

        * ``<para>``, ``<table>`` and ``<frame>`` elements are handed to
          ``rtfPara.rtfPara``, ``rtfTable.rtfTable`` and ``rtfFrame.rtfFrame``
          objects via ``set_raw``; each object's ``get_content()`` plus a
          newline is appended to the column content.
        * ``<dd>`` elements are skipped after a notice is written to stderr.
        * ``<image>`` elements are skipped silently.

        Parsing stops at the first text that starts with none of those tags.
        The helper modules are imported only when an element of the matching
        kind is actually encountered.

        Args:
            raw: Raw text of the column node.

        Raises:
            AttributeError: If one of the four attributes cannot be found.
            ValueError: If the closing tag of a recognised element is missing.

        Results are stored under the ``_bottom``, ``_top``, ``_left``,
        ``_right``, ``_objs`` and ``_content`` keys of ``self.members``.
        """

        def attribute(name):
            # re.search returns None when the attribute is absent, so .group
            # raises AttributeError -- the same failure mode as before.
            return re.search(name + r"=\"(\d+)\"", raw).group(1)

        def split_element(text, tag):
            # Cut the leading element (up to and including its first closing
            # tag) off ``text``.  The single character right after the closing
            # tag (normally the newline) is dropped as well.
            closing = "</" + tag + ">"
            end = text.index(closing) + len(closing)
            return text[:end], text[end + 1:]

        # Get <column> node attributes
        bottom = attribute("b")
        top = attribute("t")
        left = attribute("l")
        right = attribute("r")

        # Strip the <column ...> wrapper so the text starts at the first child
        remaining = re.sub(r"^<column[^>]*>\n", "", raw)
        remaining = re.sub(r"</column>(\n)?$", "", remaining)

        # Make the helper modules importable without growing sys.path on
        # every call.
        if "./LSRRTF/" not in sys.path:
            sys.path.append("./LSRRTF/")

        objs = []
        content = ""

        while True:
            if remaining.startswith("<para"):
                import rtfPara

                obj = rtfPara.rtfPara()
                element_raw, remaining = split_element(remaining, "para")
            elif remaining.startswith("<dd"):
                # <dd> is not implemented here: report it and skip the element
                sys.stderr.write("Para::dd::Neimplementovano\n")
                _, remaining = split_element(remaining, "dd")
                continue
            elif remaining.startswith("<table"):
                import rtfTable

                obj = rtfTable.rtfTable()
                element_raw, remaining = split_element(remaining, "table")
            elif remaining.startswith("<image"):
                # Images contribute neither an object nor content
                _, remaining = split_element(remaining, "image")
                continue
            elif remaining.startswith("<frame"):
                import rtfFrame

                obj = rtfFrame.rtfFrame()
                element_raw, remaining = split_element(remaining, "frame")
            else:
                break

            # Set raw content, then record the object and its text
            obj.set_raw(element_raw)
            objs.append(obj)
            content = content + obj.get_content() + "\n"

        self.members["_bottom"] = bottom
        self.members["_top"] = top
        self.members["_left"] = left
        self.members["_right"] = right
        self.members["_objs"] = objs
        self.members["_content"] = content
示例#2
0
 def parse(self, raw):
     """Parse the raw text of a page into objects and concatenated content.

     All ``<page>``, ``</page>``, ``<body>`` and ``</body>`` tags are removed
     from ``raw``.  The remaining text is then consumed from the front:

     * a leading ``<section ...>`` opening tag and the first ``</section>``
       closing tag are stripped, and the elements that follow are parsed;
     * ``<column>``, ``<dd>`` and ``<frame>`` elements are handed to
       ``rtfCol.rtfCol``, ``rtfDd.rtfDd`` and ``rtfFrame.rtfFrame`` objects
       via ``set_raw``; each object is appended to ``tmp_objs`` and its
       ``get_content()`` plus a newline is appended to ``tmp_content``.

     Parsing stops as soon as no further text can be consumed.  The helper
     modules are imported only when an element of the matching kind is met.

     The results are left in the module-level globals ``tmp_content`` and
     ``tmp_objs``, which are reset at the start of every call.

     Args:
         raw: Raw text of the page.

     Raises:
         ValueError: If the closing tag of a recognised element is missing.
     """
     global tmp_content
     global tmp_objs

     def consume_elements(text):
         # Parse the run of column/dd/frame elements at the start of ``text``
         # and return whatever follows that run.
         global tmp_content
         while True:
             if text.startswith("<column"):
                 import rtfCol
                 obj, closing = rtfCol.rtfCol(), "</column>"
             elif text.startswith("<dd"):
                 import rtfDd
                 obj, closing = rtfDd.rtfDd(), "</dd>"
             elif text.startswith("<frame"):
                 import rtfFrame
                 obj, closing = rtfFrame.rtfFrame(), "</frame>"
             else:
                 return text

             end = text.index(closing) + len(closing)

             # Set raw content on the object that was just created
             obj.set_raw(text[:end])

             # Update object list and content
             tmp_objs.append(obj)
             tmp_content += obj.get_content() + "\n"

             # Also drop the single character after the closing tag
             # (normally the newline).
             text = text[end + 1:]

     # At first, content is blank
     tmp_content = ""
     # because there's no column, table or image
     tmp_objs = []

     tmpRaw = raw
     tmpRaw = re.sub(r"<page>\n", "", tmpRaw)
     tmpRaw = re.sub(r"</page>(\n)?", "", tmpRaw)
     tmpRaw = re.sub(r"<body>\n", "", tmpRaw)
     tmpRaw = re.sub(r"</body>(\n)?", "", tmpRaw)

     # Make the helper modules importable without growing sys.path on
     # every call.
     if "./LSRRTF/" not in sys.path:
         sys.path.append("./LSRRTF/")

     while True:
         pending = tmpRaw
         if pending.startswith("<section"):
             # Unwrap the section: drop its opening tag and the first
             # closing tag, then parse what is left like top-level text.
             pending = re.sub(r"^<section[^>]*>\n", "", pending)
             pending = re.sub(r"</section>\n", "", pending, count=1)

         rest = consume_elements(pending)

         # Nothing was stripped or parsed: either the text starts with an
         # unknown tag or it is exhausted.  Stopping here also prevents an
         # endless loop on a section tag that could not be stripped.
         if rest == tmpRaw:
             break
         tmpRaw = rest