def saveUrls(self, dir, reportUrl, detailUrl):
    """Record the report and detail URLs in ``urls.txt`` inside *dir*.

    The file is only written when it does not exist yet, so an existing
    ``urls.txt`` is never overwritten.

    Args:
        dir: Directory that receives ``urls.txt``.
        reportUrl: URL of the report page (first line of the file).
        detailUrl: URL of the detail page (second line of the file).
    """
    filename = os.path.join(dir, "urls.txt")
    if os.path.isfile(filename):
        return
    # One URL per line. The separator used to be '\r\b' (CR + backspace),
    # a typo for CRLF that glued both URLs together with a control character.
    FileHelper.saveToFile(filename, '%s\r\n%s' % (reportUrl, detailUrl))
def saveProfile(self, profileName, dir, reportUrl, detailUrl, overallEntry): assetdir = os.path.join(dir, "files" + os.sep) if (not os.path.isdir(dir)): os.makedirs(dir) if (not os.path.isdir(assetdir)): os.makedirs(assetdir) status, page = self.downloader.download_page(reportUrl, dir, assetdir, '%s_origin.htm' % (profileName), css=False, javascript=False, image=False) #self.downloader.clear_cache() if (page != None): reporter = None reportContent = "" #headers items = page.xpath(u"//*[@id='maincontent']//article/header/hgroup/*") for item in items: header = StrHelper.trim(item.text_content()) if (header != None and header.startswith(profileName)): header = StrHelper.trim(header[len(profileName):]) reportContent += header + os.linesep break reportContent += os.linesep #content reg = re.compile(ur"^基金會編號.*$", re.MULTILINE) allsymbols = ur" ,、。.?!~$%@&#*‧;︰…‥﹐﹒˙·﹔﹕‘’“”〝〞‵′〃├─┼┴┬┤┌┐╞═╪╡│▕└┘╭╮╰╯╔╦╗╠═╬╣╓╥╖╒╤╕║╚╩╝╟╫╢╙╨╜╞╪╡╘╧╛﹣﹦≡|∣∥–︱—︳╴¯ ̄﹉﹊﹍﹎﹋﹌﹏︴﹨∕╲╱\/↑↓←→↖↗↙↘〔〕【】﹝﹞〈〉﹙﹚《》(){}﹛﹜『』「」<>≦≧﹤﹥︵︶︷︸︹︺︻︼︽︾︿﹀∩∪﹁﹂﹃﹄" regReporters = [ #re.compile(ur"[。:」\s]+(.{3,4})口述.?記者(.{3,4})(?:採訪整理)?$", re.MULTILINE), re.compile(allsymbols + ur"[\s]+(.{2,4})[口筆]述\s?.?\s?記者(.{2,4})(?:採訪整理)?$", re.MULTILINE), #[\u4e00-\u9fa5] 英文字符之外的字符,包括中文漢字和全角標點 re.compile(ur"報導.攝影.(.{2,4})記者$", re.MULTILINE), re.compile(ur"報導.攝影.(.{2,4})$", re.MULTILINE), re.compile(ur"攝影.報導.(.{2,4})$", re.MULTILINE), re.compile(ur"攝影.(.{2,4})$", re.MULTILINE), re.compile(ur"報導.(.{2,4})$", re.MULTILINE), re.compile(ur"報導.(.{2,4})$", re.MULTILINE), re.compile(ur"記者(.{2,4})採訪整理$", re.MULTILINE), re.compile(ur"^【(.{2,4})╱.{2,4}報導】", re.MULTILINE), ] #preserve <br> tags as \n brs = page.xpath(u"//div[@class='articulum']//br") if (len(brs) == 0): brs = page.xpath(u"//div[@class='articulum trans']//br") for br in brs: br.tail = "\n" + br.tail if br.tail else "\n" items = page.xpath(u"//div[@class='articulum']/*") if (len(items) == 0): items = page.xpath(u"//div[@class='articulum trans']/*") for item in items: tag = item.tag.lower() 
id = self.get_attrib(item, "id", None) # if (tag == "figure"): continue # if (tag == "iframe"): break if (id == "bcontent" or id == "bhead" or id == "introid"): text = StrHelper.trim(item.text_content()) if (text == None or text == ""): continue if (id != "bhead"): for regReporter in regReporters: list = regReporter.findall(text) if (len(list) == 1): if (not isinstance(list[0], basestring)): reporter = "/".join(list[0]) else: reporter = list[0] text = StrHelper.trim(regReporter.sub('', text)) break if (reporter): overallEntry.reporter = reporter else: self.logger.warn("error: parsing reporter: %s" % reportUrl) text = StrHelper.trim(reg.sub('', text)) reportContent += text + os.linesep + os.linesep FileHelper.saveToFile(os.path.join(dir, reportFileName), reportContent) status, page = self.downloader.download_page(detailUrl, dir, assetdir, detailSrcFileName, css=False, javascript=False, image=False) if (page != None): items = page.xpath(u"//div[@id='charitysidebox3'][1]/div[@id='inquiry3']/table//tr") maxDate = None if (len(items) > 0): file = None try: file = open(os.path.join(dir, detailFileName), "wb") csvwriter = csv.writer(file) for index, item in enumerate(items): if (index > 1): cols = item.xpath(u".//td") if (len(cols) == 4): no = StrHelper.trim(cols[0].text) name = StrHelper.trim(cols[1].text) amount = StrHelper.trim(cols[2].text) dateStr = StrHelper.trim(cols[3].text) try: date = datetime.datetime.strptime(dateStr, "%Y/%m/%d") if (maxDate == None or date > maxDate): maxDate = date except Exception as ex: self.logger.warn("error date format:%s in %s" % (dateStr, detailUrl)) csvwriter.writerow([no, dateStr, amount, name]) overallEntry.enddate = maxDate.strftime("%Y/%m/%d") if maxDate != None else "" overallEntry.doners = len(items) - 2 except Exception as ex: self.logger.exception(LogHelper.getExceptionMsg(ex, "error paring detail.html")) finally: if (file): file.close()