def run(self):
    target = os.path.join(os.path.dirname(self.output().path), str(self.date))
    if not os.path.exists(target):
        os.makedirs(target)
    _, errorlog = tempfile.mkstemp(prefix="siskin-")
    stylesheet = self.input().get("stylesheet").path
    size = wc(self.input().get("filelist").path)

    with self.input().get("filelist").open() as handle:
        for i, row in enumerate(handle.iter_tsv(cols=("path",)), start=1):
            basename = os.path.basename(row.path)
            name = basename.replace(".xml", ".marcxml")
            destination = os.path.join(target, name)
            if not os.path.exists(destination):
                try:
                    output = shellout("xsltproc {xsl} {input} > {output}",
                                      input=row.path, xsl=stylesheet)
                    luigi.File(output).move(destination)
                except RuntimeError as err:
                    self.logger.error("{0}: {1}".format(row.path, err))
                    with open(errorlog, "a") as log:
                        log.write("%s\t%s\n" % (row.path, err))
            self.logger.debug("{0}/{1} {2}".format(i, size, row.path))

    # write receipt
    with self.output().open("w") as output:
        for path in iterfiles(target):
            output.write_tsv(path)

    # this is just a temporary artefact for now
    self.logger.debug("Conversion errors logged at: {0}".format(errorlog))
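# The receipt above is written with `iterfiles`. A minimal sketch of such a
# helper, assuming it simply walks the target directory and yields file paths
# (the actual siskin implementation may differ):
import os

def iterfiles(directory):
    """Yield the path of every file below directory."""
    for root, _, filenames in os.walk(directory):
        for filename in filenames:
            yield os.path.join(root, filename)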
def run(self):
    # create target subdirectory
    target = os.path.join(os.path.dirname(self.output().path), str(self.closest()))
    if not os.path.exists(target):
        os.makedirs(target)
    size = wc(self.input().path)

    with self.input().open() as handle:
        for i, row in enumerate(handle.iter_tsv(cols=("url",)), start=1):
            name = os.path.join(target, row.url.split("/")[-2])
            destination = "{name}.xml".format(name=name)
            if not os.path.exists(destination):
                output = shellout("""wget -q --retry-connrefused {url} -O {output}""",
                                  url=row.url)
                luigi.File(output).move(destination)
            self.logger.debug("{0}/{1} {2}".format(i, size, row.url))

    # write "receipt"
    with self.output().open("w") as output:
        for path in iterfiles(target):
            if path.endswith(".xml"):
                output.write_tsv(path)
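# The download above goes through `shellout`, which formats a shell command
# template and runs it. A simplified sketch, under the assumption that a
# temporary output file is created whenever {output} appears in the template
# but is not passed explicitly; the real helper also handles quoting and an
# `ignoremap` of tolerated exit codes:
import subprocess
import tempfile

def shellout(template, **kwargs):
    """Run the command built from template; return the output path."""
    if "output" not in kwargs and "{output}" in template:
        _, kwargs["output"] = tempfile.mkstemp(prefix="siskin-")
    command = template.format(**kwargs)
    code = subprocess.call(command, shell=True)
    if code != 0:
        raise RuntimeError("exit code {0}: {1}".format(code, command))
    return kwargs.get("output")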
def run(self):
    target = os.path.join(os.path.dirname(self.output().path), str(self.date))
    if not os.path.exists(target):
        os.makedirs(target)
    size = wc(self.input().path)
    _, combined = tempfile.mkstemp(prefix="siskin-")

    with self.input().open() as handle:
        for i, row in enumerate(handle.iter_tsv(cols=("path",)), start=1):
            # Cleanup wrongly nested data fields, see:
            # https://gist.github.com/miku/ea779a221d00b5524fcd
            # in 2014-05, this corrects 673 errors, while 31 are not yet
            # recoverable!
            with open(row.path) as marcfile:
                f = cStringIO.StringIO(marcfile.read())
            doc = etree.parse(f)
            result = doc.xpath("/marc:record/marc:datafield/marc:datafield",
                               namespaces={"marc": "http://www.loc.gov/MARC21/slim"})
            if len(result) > 0:
                self.logger.debug("Fixing broken MARCXML in: {0}".format(row.path))
                for misplaced in result:
                    parent = misplaced.getparent()
                    record = misplaced.getparent().getparent()
                    parent.remove(misplaced)
                    record.append(misplaced)

            _, cleaned = tempfile.mkstemp(prefix="siskin-")
            with open(cleaned, "w") as output:
                output.write(etree.tostring(doc, pretty_print=True))

            # actually do the conversion ...
            basename = os.path.basename(row.path)
            name = basename.replace(".marcxml", ".mrc")
            destination = os.path.join(target, name)
            if not os.path.exists(destination):
                # exit(5) for serious decoding errors
                # see: http://www.indexdata.com/yaz/doc/NEWS
                shellout("""yaz-marcdump -i marcxml -o marc {input} >> {output}""",
                         input=cleaned, output=combined, ignoremap={5: "FIXME"})
            self.logger.debug("{0}/{1} {2}".format(i, size, row.path))

    luigi.File(combined).move(self.output().path)
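# The XPath repair of wrongly nested datafields can be tried in isolation.
# A self-contained example; the record below is made up, only the namespace
# and the nesting pattern match the MARCXML handled above:
from lxml import etree

NS = {"marc": "http://www.loc.gov/MARC21/slim"}

sample = """
<record xmlns="http://www.loc.gov/MARC21/slim">
  <datafield tag="245">
    <subfield code="a">A title</subfield>
    <datafield tag="700">
      <subfield code="a">A wrongly nested field</subfield>
    </datafield>
  </datafield>
</record>
"""

doc = etree.fromstring(sample)
# find datafields nested inside other datafields ...
for misplaced in doc.xpath("/marc:record/marc:datafield/marc:datafield", namespaces=NS):
    parent = misplaced.getparent()
    record = parent.getparent()
    # ... and reattach them directly under the record element
    parent.remove(misplaced)
    record.append(misplaced)

print(etree.tostring(doc, pretty_print=True))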