def doWork(self, root, fileName):
    """Process one ``.xml`` file in place and bump the processed counter.

    Files whose name does not end in ``.xml`` are ignored. Otherwise the
    document is loaded (either merged with its data file through
    ``integrateParentWithData`` or parsed directly), passed through
    ``self.semantify`` and ``Divider.doWork``, and written back over the
    source file as UTF-8.

    Args:
        root: Directory containing the file; joined to ``fileName`` with "/".
        fileName: Name of the file inside ``root``.

    Returns:
        None.
    """
    if not fileName.endswith(".xml"):
        return

    srcFile = root + "/" + fileName
    # The result overwrites the source file.
    resultFilePath = srcFile

    if self.dataFileExists(fileName, srcFile):
        soup = self.integrateParentWithData(fileName, srcFile)
    else:
        # ``with`` guarantees the handle is closed even if read() raises.
        with codecs.open(srcFile, "r", "utf-8") as xmlDataFile:
            xmlData = xmlDataFile.read()
        xmlData = html.unescape_string(xmlData)
        soup = BeautifulSoup(xmlData, "lxml")

    soup = self.semantify(soup, resultFilePath)

    # Finally, perform sentence segmentation.
    divider = Divider(soup, self.config_file_path)
    soup = divider.doWork()

    # Serialise BEFORE opening the output: mode "w" truncates the file, and
    # the output path is the source file itself, so a serialisation failure
    # after opening would leave the source empty.
    output = self.beautiful_soup_tag_to_unicode(soup)
    with codecs.open(resultFilePath, "w", "utf-8") as resultFile:
        resultFile.write(output)

    self.count += 1
    # Single parenthesised expression: prints the same text under Python 2's
    # print statement and is also valid Python 3.
    print("Processed: %d" % self.count)
def integrateParentWithData(self, fileName, parentFile):
    """Merge a parent page file with its data file into a single soup.

    The data file (located through ``self.getDataFilePathForFileName``) is
    read, entity-unescaped, stripped of namespaced tags, and parsed with
    lxml. The ``parentpageurl`` element found in the parsed parent file is
    then inserted as the first child of the data document's ``article``
    element.

    Args:
        fileName: File name used to look up the matching data file.
        parentFile: Path of the parent file providing ``parentpageurl``.

    Returns:
        The BeautifulSoup object built from the data file, with the
        parent's ``parentpageurl`` inserted into ``article``.

    NOTE(review): if the data document has no ``article`` element,
    ``dataSoup.article`` is None and the insert raises AttributeError,
    exactly as before; confirm the inputs always contain one.
    """
    dataFile = self.getDataFilePathForFileName(fileName)
    # ``with`` closes the handle even when read() raises.
    with codecs.open(dataFile, "r", "utf-8") as data:
        dataContent = data.read()

    # Unescape twice on purpose: the first pass turns doubly-escaped
    # entities into plain entities such as "&nbsp;", the second pass turns
    # those into the actual characters.
    dataContent = html.unescape_string(dataContent)
    dataContent = html.unescape_string(dataContent)
    dataContent = dataContent.replace("<o:p>", "<p>").replace("</o:p>", "</p>")

    # 2013-03-27, fix #374: drop every tag that belongs to a namespace
    # declared through "<?xml:namespace prefix = X ns ...>". This replaced
    # an earlier approach that wrapped <st1:...> tags in HTML comments.
    prefixes = re.findall(r"xml:namespace prefix = (.*?) ns", dataContent)
    for prefix in prefixes:
        # The prefix comes from the document, so escape it before splicing
        # it into a pattern; otherwise regex metacharacters in the prefix
        # would be interpreted (or make the pattern invalid).
        escaped = re.escape(prefix)
        dataContent = re.sub("<" + escaped + ":.*?>", "", dataContent)
        dataContent = re.sub("</" + escaped + ":.*?>", "", dataContent)
    if prefixes:
        # Remove the declarations themselves, once; as before, only when at
        # least one prefix was recognised above.
        dataContent = re.sub(r"<\?xml:namespace prefix.*?>", "", dataContent)

    with codecs.open(parentFile, "r", "utf-8") as parent:
        parentContent = parent.read()
    parentContent = html.unescape_string(parentContent)

    dataSoup = BeautifulSoup(dataContent, "lxml")
    parentSoup = BeautifulSoup(parentContent, "lxml")
    dataSoup.article.insert(0, parentSoup.parentpageurl)
    return dataSoup