def get(self):
    """Download one complaint decision and populate ``self.complaint`` from it.

    Does nothing unless ``self.complaint`` holds a truthy ``"id"``. Otherwise:

    * derives ``"year"`` from the first two characters of the id (prefixed
      with ``"20"``) and asks ``self.docType`` for the document type;
    * downloads the decision file and, for ``"doc"`` documents, passes the
      complaint's display page to ``self.__getPage``;
    * resets the derived fields to ``None`` and records ``"docdate"`` and
      ``"docsize"`` from ``download.getFileDetails``;
    * if the document exists on disk, fills ``"doc"`` (plain text) and
      ``"html"`` from cached copies, or regenerates them by running the
      external converters.

    A cached copy is reused when its mtime equals ``"docdate"`` or when
    ``self.quick`` is truthy.

    Returns:
        None. All results are stored in ``self.complaint``.

    Raises:
        Exception: anything raised while running the HTML converter
            (e.g. ``subprocess.CalledProcessError``) propagates; a failed
            text conversion is only reported with ``print``.
    """
    if not ("id" in self.complaint and self.complaint["id"]):
        return

    complaint_id = self.complaint["id"]
    year = int("20" + complaint_id[0:2])
    self.complaint["year"] = year
    docType = self.docType(year)

    docfilename = download.getFilename(year, "docs", complaint_id, "." + docType)
    textfilename = download.getFilename(year, "text", complaint_id, ".txt")
    htmlfilename = download.getFilename(year, "html", complaint_id, ".html")

    #download.getFile("/".join(("http://203.152.114.11/decisions", self.complaint["id"][0:2], self.complaint["id"] + "." + docType)), docfilename, self.refresh, returnfile = False)
    download.getFile("http://old.asa.co.nz/decision_file.php?ascbnumber=" + complaint_id, docfilename, self.refresh, returnfile=False)
    if docType == "doc":
        self.__getPage(download.getFile("http://old.asa.co.nz/display.php?ascb_number=" + complaint_id, download.getFilename(year, 'pages', complaint_id), self.refresh))

    # Clear every derived field so nothing from an earlier run survives.
    for field in ["docdate", "docsize", "docwords", "complainants", "companies", "meetingdate"]:
        self.complaint[field] = None
    self.complaint["docdate"], self.complaint["docsize"] = download.getFileDetails(docfilename)

    if not os.path.exists(docfilename):
        return

    # The converters are given an absolute path to the document.
    docpath = os.path.join(os.getcwd(), docfilename)

    # --- plain-text rendition -------------------------------------------
    if os.path.exists(textfilename) and (self.complaint["docdate"] == os.path.getmtime(textfilename) or self.quick):
        self.complaint["doc"] = download.loadResource(textfilename)
    else:
        if docType == "doc":
            command = [antiword, "-w", "0", "-m", "8859-1.txt"]
        else:
            command = [unrtf, "--text"]
        command.append(docpath)
        try:
            text = subprocess.check_output(command).decode('unicode_escape')
        except Exception as e:
            # Best effort: report the failure and carry on to the HTML step.
            print("Failed to convert doc " + docpath + ": " + str(e))
        else:
            if docType == "rtf":
                # Drop everything up to the first dashed separator line.
                # If the separator is missing, keep the text as-is instead
                # of failing with an IndexError.
                _, separator, remainder = text.partition("-----------------")
                if separator:
                    text = remainder
            self.complaint["doc"] = text
            # presumably the tuple is (atime, mtime) so the cached text
            # carries the document's date -- TODO confirm in download module
            download.saveResourcePost(textfilename, text, (time.time(), self.complaint["docdate"]))

    # --- HTML rendition --------------------------------------------------
    if os.path.exists(htmlfilename) and (self.complaint["docdate"] == os.path.getmtime(htmlfilename) or self.quick):
        # NOTE(review): this branch stores the whole cached file, whereas a
        # fresh conversion below stores only the <body> contents -- confirm
        # which form callers expect.
        self.complaint["html"] = download.loadResource(htmlfilename)
    else:
        command2 = [unoconv, "-f", "html", "-o", os.path.join(os.getcwd(), htmlfilename), docpath]
        print("Saving: " + htmlfilename)
        subprocess.check_output(command2)
        page = download.loadResource(htmlfilename)
        body = re.search('<body(?:.*?)>(.*?)</body>', page, flags=re.DOTALL)
        # Fall back to the full page when no <body> element is found rather
        # than raising AttributeError on a None match.
        self.complaint["html"] = body.group(1) if body else page
def getList(self, folder, filename):
    """Return the lines of the resource stored at ``folder``/``filename``.

    The resource is read with ``download.loadResource``, decoded with the
    ``unicode_escape`` codec and split on line boundaries.
    """
    resource_path = os.path.join(folder, filename)
    raw = download.loadResource(resource_path)
    decoded = raw.decode('unicode_escape')
    return decoded.splitlines()