def run(self, oxtFile):
    """Rebuild/patch an OXT (OpenOffice extension) archive.

    Warns about missing extracted files, (re)creates the .oxt zip from the
    extract directory if the archive is absent, then embeds each toolbar
    Basic text file into the XBA XML wrapper and adds it to the zip.

    NOTE(review): nesting reconstructed from a flattened source line -- in
    particular, the non-ASCII escaping fallback is assumed to live inside
    the except block; confirm against the original file.
    """
    # Warn about any expected extracted file that is missing.
    for file in expectedFiles:
        file = oxtExtract + "/" + file
        if not self.__fs.exists(file):
            print "Error: %s was not found." % file
    # If the .oxt archive itself is missing, build it from the extract dir.
    if not self.__fs.exists(oxtFile):
        print "Warning: %s not found. created instead." % oxtFile
        self.__fs.zip(oxtFile, oxtExtract)
    # Embed each toolbar text file into its XBA wrapper inside the zip.
    for toolbarTextFile, toolbarXbaFile in toolbarTextFiles.items():
        toolbarText = self.__fs.readFile(toolbarTextFile)
        if toolbarText is None:
            print "File '%s' not found!" % self.__fs.absolutePath(toolbarTextFile)
            return
        #check if there weird char in the content. If there is fixed it
        #otherwise it will break the code in the toolbar installer.
        try:
            toolbarText = toolbarText.encode("utf-8")
        except Exception,e:
            print "fileName : ", toolbarTextFile
            print "error in string: ",str(e)
            # Fallback: replace each char above 127 with an XML numeric
            # character reference so the toolbar installer is not broken.
            newText = ""
            t = toolbarText
            for c in t:
                newC = c
                if ord(c)>127:
                    print "error char : ",c
                    newC = "&#%s;" % ord(c)
                newText += newC
            toolbarText = newText
        # Wrap the (sanitized) text in the XBA XML template and zip it in.
        xml = xml_util.xml(xbaXmlWrapper)
        xml.getRootNode().setContent(toolbarText)
        xmlStr = str(xml)
        xml.close()
        self.__fs.addToZipFile(oxtFile, toolbarXbaFile, xmlStr)
def test_no_language():
    """An 008 tag built from archive 1's marc.xml defaults to English."""
    record = xml_util.xml(
        "create_marc_controlfield_tag_008_test_archive/1/marc.xml")
    built = createTag(record)
    expected = getTestDate() + "t2002||||||| eng|d"
    assert built == expected
def obtain_files(dspace_archive, filename): if os.path.exists(dspace_archive): archive = dspaceArchive(dspace_archive) #loop through items for item in archive.items: print "Processing Item " + item.name fullpath = os.path.join(item.dir, filename) xml = xml_util.xml(fullpath) fileList = [] #match all urls nodes = xml.getNodes("//text()[contains(.,'http')]" and "//text()[contains(.,'http')]") #qualify urls for node in nodes: splitNode = node.content splitNode = splitNode.split(";") url = splitNode[1] fileList.append(url) #get urls for urlItem in fileList: harvestUrl = cleanUrl(urlItem) print urlItem content = getFile(harvestUrl) if content != None: harvestedFile = urlItem.rsplit("/")[-1] item.newStream(harvestedFile, "bundle:ORIGINAL", content) else: print "no content" xml.close()
def fetchXml(htmlFile): createdXml = """<?xml version="1.0"?><collection>""" parsedXml = xml_util.xml(createdXml) #readFile = getFile(htmlFile) webPageXml = xml_util.xml(htmlFile) nodes = webPageXml.getNodes("//b") print "********************************" for node in nodes: newElement = parsedXml.createElement( repr(node.getContent()), repr(node.getNextSibling().serialize())) parsedXml.addChild(newElement) file = open("temp.xml", "w") file.write(parsedXml.serialize()) return parsedXml
def test_htmlToXML():
    """htmlToXML output matches the lowercase meta reference document."""
    targetFile = open("xml_data/meta_lowercase.xml", 'r')
    try:
        target = targetFile.read()
    finally:
        # the original never closed either file handle
        targetFile.close()
    htmlFile = open("html_data/thompson-index.html", 'r')
    try:
        html = htmlFile.read()
    finally:
        htmlFile.close()
    # (dropped the unused xml_util.xml(target) object the original built)
    result = htmlToXML(html)
    assert diff_util.sameXml(target, result)
def test_bodyHtmlToXML():
    """bodyHtmlToXML output matches the body_item reference document."""
    targetBodyFile = open("xml_data/body_item.xml", 'r')
    try:
        targetBody = targetBodyFile.read()
    finally:
        # the original never closed either file handle
        targetBodyFile.close()
    htmlFile2 = open("html_data/thompson-index.html", 'r')
    try:
        bodyHtml = htmlFile2.read()
    finally:
        htmlFile2.close()
    # (dropped the unused xml_util.xml(targetBody) object the original built)
    resultBody = bodyHtmlToXML(bodyHtml)
    assert diff_util.sameXml(targetBody, resultBody)
def test_little_language():
    """A one-letter 041$a language code is space-padded to width 3."""
    record = xml_util.xml(
        '<datafield tag="041" ind1=" " ind2=" "><subfield code="a">f</subfield></datafield>'
    )
    code = getLanguage(record)
    # pad to the fixed 3-character MARC language-code width
    code = code.ljust(3)
    assert code == "f  "
def getMissingDatastreams(file, datastreamId): xpath = "//*[local-name()='datastream'][@ID='%s']" % (datastreamId) #xpath = "//*[local-name()='datastream'][@ID='FULLTEXT']" print xpath xml = xml_util.xml(file) dsList = xml.getNodes(xpath) for d in dsList: print d return dsList
def test_getPublicationDate5():
    """marc4.xml yields primary, alternate and 046 publication dates."""
    record = xml_util.xml(
        "create_marc_controlfield_tag_008_test_archive/2/marc4.xml")
    primary = getPublicationDate(record)
    alternate = getAlternateDate(record)
    from046 = getPublicationDate046(record)
    assert primary == "1260"
    assert alternate == "1963"
    assert from046 == "1952"
def cleanTitle(input): xml = xml_util.xml(input) titleXml = xml.getNodes( "//*[local-name()='datafield'][@tag='245']/*[local-name()='subfield'][@code='a']" ) for node in titleXml: print node.content titleString = node.content titleString = titleString.rstrip(":") node.setContent(titleString) xml.saveFile(input) xml.close()
def cleanKeyword(input): xml = xml_util.xml(input) keywordXml = xml.getNodes( "//*[local-name()='datafield'][@tag='650']/*[local-name()='subfield'][@code='x']" ) for node in keywordXml: print node.content string1 = node.content cleanString = string.replace(string1, "]", "") cleanString2 = string.replace(cleanString, ".", "") node.setContent(cleanString2) xml.saveFile(input) xml.close()
def getIdentifier(dcString): dcxml = xml_util.xml(dcString,[("dc","http://purl.org/dc/elements/1.1/"),("xsi","http://www.w3.org/2001/XMLSchema-instance")]) try: pageUrl = dcxml.getNode("//dc:identifier").content return pageUrl #[@xsi:type='dcterms:URI'] except Exception, errorInfo: print errorInfo print "Unable to find contents" return None
def iterate(archiveName): arc = dspaceArchive(archiveName) for item in arc.items: print item.name + " is being processed" #fileContents = item.readFile("marc.xml") x = item.getRelPathToStream('marc.xml') fullPath = os.path.join(archiveName, x) print fullPath input = xml_util.xml(fullPath) print input tag = createTag(input) node = input.getNode("//*[local-name()='controlfield'][@tag='008']") node.setContent(tag) input.saveFile(fullPath) print item.name + " controlfield tag 008, update complete" input.close() print "Creating of marc controlfield tag[s] is complete"
def addMarcTag(input, recordType):
    """Insert a record-type-specific <datafield> just before </record>.

    input      -- path to a MARC XML file, rewritten in place
    recordType -- 'B' Brunner genre + Part III rights note
                  'E' Australasian Digital Thesis genre (only if missing)
                  'W' publisher-permission rights note
                  'L' placeholder ('*') rights note
                  anything else leaves the file untouched
    """
    marcFile = open(input, 'rb')
    try:
        readFile = marcFile.read()
    finally:
        # close before any rewrite below (the original held it open)
        marcFile.close()
    splitMarc = readFile.split("</record>")
    marcString = None
    if recordType == "B":
        marcString = """<datafield tag="655" ind1=" " ind2="7">
<subfield code="a">Brunner digitised document</subfield>
<subfield code="2">local</subfield>
</datafield>
<datafield tag="540" ind1=" " ind2=" ">
<subfield code="a">PART III. After reasonable investigation, this material has been reproduced in reliance on Part III of the Australian Copyright Act 1968. The electronic form of this material is Copyright Macquarie University, Sydney. Please contact the Macquarie University Copyright Unit with inquiries www.copyright.mq.edu.au</subfield>
</datafield>"""
    if recordType == "E":
        # only add the thesis genre tag if no 655$a already carries it
        tagPresent = False
        xml = xml_util.xml(input)
        nodes = xml.getNodes(
            "//*[local-name()='datafield'][@tag='655']/*[local-name()='subfield'][@code='a']"
        )
        for node in nodes:
            # BUG FIX: the original tested
            #   (node.getContent == "Australasian Digital Thesis") != -1
            # which compares a bound method to a string (False) and then
            # False != -1 (always True), so ANY 655$a marked the tag as
            # present. Check the subfield's actual text instead.
            if node.getContent().find("Australasian Digital Thesis") != -1:
                tagPresent = True
        if not tagPresent:
            marcString = """<datafield tag="655" ind1=" " ind2=" ">
<subfield code="a">Australasian Digital Thesis</subfield>
</datafield>"""
        xml.close()
    if recordType == "W":
        marcString = """<datafield tag="540" ind1=" " ind2=" ">
<subfield code="a">Permission for use provided to the Macquarie University Digital Repository by the publisher.</subfield>
</datafield>"""
    if recordType == "L":
        marcString = """<datafield tag="540" ind1=" " ind2=" ">
<subfield code="a">*</subfield>
</datafield>"""
    if marcString is not None:
        builtString = splitMarc[0] + marcString + "</record>" + splitMarc[1]
        newFile = open(input, 'wb')
        try:
            newFile.write(builtString)
        finally:
            newFile.close()
def obtain_files(dspace_archive, filename, fileType, protocol="false", username="******", password="******"): if os.path.exists(dspace_archive): archive = dspaceArchive(dspace_archive) #loop through items for item in archive.items: print "Processing Item " + item.name fullpath = os.path.join(item.dir, filename) xml = xml_util.xml(fullpath) fileList = [] #match all urls nodes = xml.getNodes("//text()[starts-with(.,'http')]") #qualify urls for node in nodes: if node.content.endswith(fileType): fileList.append(node.content) #get urls for urlItem in fileList: harvestUrl = cleanUrl(urlItem) if protocol != "false": Content = getFile(harvestUrl, protocol, username, password) else: Content = getFileNoAuth(harvestUrl) if Content != None: pdfFileName = get_harvestedFileName(urlItem) item.newStream(pdfFileName, "bundle:ORIGINAL", Content) else: print "no content" xml.close()
def removeXmlNode(dspaceArchiveName, filename, xpath):
    """Delete xpath-matched nodes that reference /public/ or ethesis.php
    from *filename* in every item of a dspace archive, saving after each
    successful delete.

    NOTE(review): nesting reconstructed from a flattened source line; the
    placement of the final print/close relative to the loops, and of the
    error message inside the except, should be confirmed against the
    original file.
    """
    print dspaceArchiveName
    if os.path.exists(dspaceArchiveName):
        arc = dspaceArchive(dspaceArchiveName)
        for item in arc.items:
            print "Processing " + item.name + "."
            filePath = os.path.join(item.dir, filename)
            print filePath
            xml = xml_util.xml(filePath)
            nodeToDelete = xml.getNodes(xpath)
            for node in nodeToDelete:
                nodeContent = node.getContent()
                # only remove nodes pointing at legacy public/ethesis URLs
                if nodeContent.find("/public/") != -1 or nodeContent.find(
                        "ethesis.php") != -1:
                    try:
                        node.delete()
                        print "Successfully removed node"
                        xml.saveFile()
                    except Exception, errorInfo:
                        print errorInfo
                        print "The xpath " + xpath + " did not match a node in " + filePath + "."
            print "Processing complete."
            xml.close()
def test_getPublicationDate6():
    """marc5.xml exposes 1963 as its alternate publication date."""
    record = xml_util.xml(
        "create_marc_controlfield_tag_008_test_archive/2/marc5.xml")
    alternate = getAlternateDate(record)
    assert alternate == "1963"
def test_createTag2():
    """marc4.xml produces a multi-date ('m') 008 with both dates filled."""
    record = xml_util.xml(
        "create_marc_controlfield_tag_008_test_archive/2/marc4.xml")
    expected = getTestDate() + "m12601963||| eng|d"
    assert createTag(record) == expected
def test_createTag3():
    """marc2.xml produces a no-date ('n') 008 tag."""
    record = xml_util.xml(
        "create_marc_controlfield_tag_008_test_archive/2/marc2.xml")
    expected = getTestDate() + "n||||||||||| eng|d"
    assert createTag(record) == expected
def test_createTag():
    """Archive 0's marc.xml produces a French single-date 008 tag."""
    record = xml_util.xml(
        "create_marc_controlfield_tag_008_test_archive/0/marc.xml")
    expected = getTestDate() + "t2002||||||| fr |d"
    assert createTag(record) == expected
def obtainFiles(dspace_archive, filename, pathToDeadLinksFile, protocol="false", username="******", password="******"):
    """Harvest every http(s) URL referenced in each item's valet XML file.

    For each item: collect URL-bearing text nodes, download each link
    (basic auth / https / plain, or via redirect when the link is an HTML
    page), attach successful downloads as ORIGINAL-bundle streams, record
    them back in the valet XML, and log failures to a dead-links report.

    dspace_archive      -- path to an existing dspace archive directory
    filename            -- name of the XML file inside each item to scan
    pathToDeadLinksFile -- where the DeadLinksFile report is written
    protocol            -- if not "false", use basic-auth getFile()

    NOTE(review): nesting reconstructed from a flattened source line -- the
    placement of the per-item report/reset and the final closeFile/prints
    should be confirmed against the original file.
    """
    #create new file for dead links
    downloadCounter = 0
    deadLinkReport = DeadLinksFile(pathToDeadLinksFile)
    if os.path.exists(dspace_archive):
        archive = dspaceArchive(dspace_archive)
        #loop through items
        for item in archive.items:
            print "Processing Item " + item.name
            fullPath = os.path.join(item.dir, filename)
            xml = xml_util.xml(fullPath)
            fileList = []
            #match all urls
            nodes = xml.getNodes("//text()[contains(.,'http')]")
            # session id used when reporting this item's dead links
            sessionNode = xml.getNode("//session")
            sessionNodeContent= sessionNode.content
            #qualify urls and store in list
            for node in nodes:
                if node.content.startswith("http://") or node.content.startswith("https://"):
                    fileList.append(node.content)
                else:
                    # not a fetchable URL -> record as dead link
                    deadLinkReport.reportToScreen(node.content)
                    deadLinkReport.addDeadLinkT(node.content)
                    print "\n\n"
            #iterate through list of urls
            for urlItem in fileList:
                harvestUrl = cleanUrl(urlItem)
                #determine if link is a downloadable non xml datastream or just a html page
                isHtml = "false"
                isHtml = determineMimeType(harvestUrl)
                content = None
                if isHtml == "false":
                    #determine if user has entered authentication for downloading datastreams using basic auth
                    if protocol != "false":
                        content= getFile(harvestUrl, protocol, username, password)
                    else:
                        if harvestUrl.startswith("https://"):
                            content= getFileHttps(harvestUrl)
                        else:
                            content= getFileNoAuth(harvestUrl)
                else:
                    # HTML pages are fetched through their redirect target
                    content = getRedirectedFile(harvestUrl)
                if content!= None:
                    harvestedFileName = getHarvestedFileName(harvestUrl)
                    #shorten filename so that file system does not complain
                    correctlySizedFileName = shortenFileName(harvestedFileName)
                    #add datastream to dspace archive
                    item.newStream(correctlySizedFileName, "bundle:ORIGINAL", content)
                    #append valet xml so that it knows about the new datastream
                    addAttachmentDataToValetXml(xml, correctlySizedFileName, fullPath)
                    #increment the downloadCounter
                    downloadCounter = downloadCounter + 1
                else:
                    # download failed -> record as dead link
                    deadLinkReport.reportToScreen(harvestUrl)
                    deadLinkReport.addDeadLinkT(harvestUrl)
                    print "\n\n"
            xml.close()
            # flush this item's dead links under its session id
            deadLinkReport.report(sessionNodeContent, item.name)
            deadLinkReport.reset()
            print "\n\n"
        deadLinkReport.closeFile()
        print "total number of downloads = "
        print downloadCounter
def test_getPublicationDate4():
    """marc3.xml exposes 1234 as its publication date."""
    record = xml_util.xml(
        "create_marc_controlfield_tag_008_test_archive/2/marc3.xml")
    assert getPublicationDate(record) == "1234"
xsl/marc_dc.xsl
dublin_core.xml
dspaceArchive
False
"""

import libxml2, urllib2, urlparse, sys, os, os.path, subprocess, re, unicodedata

sys.path.append("utils")
sys.path.append("dspace_archive")
import diff_util, xml_util, xslt_util
from dspace_archive import *

# Command-line arguments (see usage docstring above):
#   1: metadata filename inside each item (e.g. marc.xml)
#   2: path to the XSL stylesheet (e.g. xsl/marc_dc.xsl)
#   3: output Dublin Core filename (e.g. dublin_core.xml)
#   4: target dspace archive directory
#   5: whether to remove the input file after transforming
#load existing Archive
inputFileName = sys.argv[1]
XslFilePath = sys.argv[2]
outputFileName = sys.argv[3]
TargetArchiveName = sys.argv[4]
removeInputFileAfterTransform = sys.argv[5]
archive = dspaceArchive(TargetArchiveName)
for item in archive.items:
    print "Processing item: " + item.name
    metaString = item.readFile(inputFileName)
    meta = xml_util.xml(metaString)
    xslt = xslt_util.xslt(XslFilePath)
    # transform the item's metadata into a Dublin Core record
    temp = meta.applyXslt(xslt)
    dublinCore = str(temp)
    temp.close()
    meta.close()
    xslt.close()
    # store the DC record on the item, optionally removing the input file
    item.setDublinCoreStream(dublinCore, inputFileName, outputFileName, removeInputFileAfterTransform)
# NOTE(review): final message placed after the loop -- confirm against the
# original file (the flattened source makes the nesting ambiguous).
print "Transformation complete"