def process_document(self, session, doc):
    """Identify the document's MIME type and record it on the document.

    Writes the raw data to a temp file, runs ``file -i -b`` over it and
    stores the reported MIME type in doc.mimeType.  Any extra
    ``key=value`` attributes in the output (e.g. charset) are set as
    attributes on the document.  text/plain results are re-examined with
    plain ``file -b`` to detect SGML/XML.
    """
    cmd = "file -i -b %INDOC%"
    (qq, infn) = tempfile.mkstemp()
    os.close(qq)
    fh = open(infn, 'w')
    fh.write(doc.get_raw(session))
    fh.close()
    cmd = cmd.replace("%INDOC%", infn)
    try:
        res = getShellResult(cmd)
        mt = res.strip()
        if mt.find(';') > -1:
            bits = mt.split(';')
            mt = bits[0]
            for b in bits[1:]:
                # just stuff them on doc for now
                (attr, value) = b.split('=')
                setattr(doc, attr, value)
        if mt == "text/plain":
            # Might be sgml, xml, text etc
            res = getShellResult("file -b {0}".format(infn))
            mt2 = res.strip()
            if mt2 == "exported SGML document text":
                mt = "text/sgml"
            elif mt2 == "XML document text":
                mt = "text/xml"
            # Others include java, etc. but not very useful to us
    finally:
        # BUG FIX: the temporary file was previously never removed
        os.remove(infn)
    doc.mimeType = mt
    doc.processHistory.append(self.id)
    return doc
def process_document(self, session, doc):
    """Identify the document's MIME type and record it on the document.

    Same approach as the variant above: ``file -i -b`` over a temp copy
    of the raw data, with a second plain ``file -b`` pass to distinguish
    SGML/XML from generic text/plain.  Does not touch processHistory.
    """
    cmd = "file -i -b %INDOC%"
    (qq, infn) = tempfile.mkstemp()
    os.close(qq)
    fh = open(infn, 'w')
    fh.write(doc.get_raw(session))
    fh.close()
    cmd = cmd.replace("%INDOC%", infn)
    try:
        res = getShellResult(cmd)
        mt = res.strip()
        if mt.find(';') > -1:
            bits = mt.split(';')
            mt = bits[0]
            for b in bits[1:]:
                # just stuff them on doc for now
                (attr, value) = b.split('=')
                setattr(doc, attr, value)
        if mt == "text/plain":
            # we might be sgml, xml, text etc
            res = getShellResult("file -b {0}".format(infn))
            mt2 = res.strip()
            if mt2 == "exported SGML document text":
                mt = "text/sgml"
            elif mt2 == "XML document text":
                mt = "text/xml"
            # others include java, etc. but not very useful to us
    finally:
        # BUG FIX: the temporary file was previously never removed
        os.remove(infn)
    doc.mimeType = mt
    return doc
def find_documents(self, session, cache=0):
    """Locate files matching the configured stream and process them."""
    query = "locate %s | grep %s$" % (self.stream, self.stream)
    lines = getShellResult(query).split('\n')
    # Drop any leading "warning:" lines locate may emit before real paths
    while lines and lines[0].startswith("warning:"):
        lines = lines[1:]
    self._processFiles("", lines, cache)
def __init__(self, session, node, parent):
    """Start an Enju parser subprocess and wait until it reports Ready."""
    exePath = self.get_path(session, 'executablePath', '')
    exe = self.get_path(session, 'executable', 'enju')
    if exePath:
        tp = os.path.join(exePath, exe)
    else:
        # No configured path: fall back to whatever `which` finds,
        # or the bare executable name if `which` reports nothing
        located = getShellResult('which %s' % exe)
        tp = exe if located.startswith('which:') else located
    if self.get_setting(session, 'xml', 1):
        cmd = "%s -xml" % tp
    else:
        cmd = tp
    self.pipe = Popen(cmd, shell=True, bufsize=1,
                      stdin=PIPE, stdout=PIPE, stderr=PIPE)
    line = ""
    while line != 'Ready\n':
        # Check for errors with command
        if "command not found" in line:
            self.log_error(
                session,
                "Error while initializing EnjuObject: "
                "{0}".format(line.strip()))
            break
        line = self.pipe.stderr.readline()
def __init__(self, session, node, parent):
    """Start a geniatagger subprocess and wait for its models to load.

    Resolves the executable directory from configuration or PATH, then
    launches the tagger from inside that directory (it loads its model
    files relative to the current working directory).  Blocks until the
    tagger reports its named-entity models are loaded.

    Raises ConfigFileException when no executable path can be resolved.
    """
    self.unparsedOutput = self.get_setting(session, 'parseOutput', 0)
    tp = self.get_path(session, 'executablePath', '')
    exe = self.get_path(session, 'executable', 'geniatagger')
    if not tp:
        # Fall back to searching PATH for the executable
        tp = os.path.dirname(getShellResult('which %s' % exe))
    if not tp:
        raise ConfigFileException("%s requires the path: "
                                  "executablePath" % self.id)
    # Tagger must be started from its own directory so it can find its
    # model data files
    o = os.getcwd()
    os.chdir(tp)
    if self.get_setting(session, 'tokenize', 0):
        cmd = exe
    else:
        # -nt: input is pre-tokenized, disable the tagger's tokenizer
        cmd = "%s -nt" % exe
    self.pipe = Popen(cmd, shell=True, bufsize=1,
                      stdin=PIPE, stdout=PIPE, stderr=PIPE)
    # NOTE(review): loops forever if the tagger fails to start and never
    # prints this exact line — confirm acceptable for this deployment
    l = ""
    while l != 'loading named_entity_models..done.\n':
        l = self.pipe.stderr.readline()
    os.chdir(o)
def __init__(self, session, node, parent):
    """Launch the enju executable and block until its 'Ready' prompt."""
    base = self.get_path(session, 'executablePath', '')
    exe = self.get_path(session, 'executable', 'enju')
    if not base:
        # Search PATH; keep the bare name if `which` found nothing
        found = getShellResult('which %s' % exe)
        executable = exe if found.startswith('which:') else found
    else:
        executable = os.path.join(base, exe)
    useXml = self.get_setting(session, 'xml', 1)
    cmd = "%s -xml" % executable if useXml else executable
    self.pipe = Popen(cmd, shell=True, bufsize=1,
                      stdin=PIPE, stdout=PIPE, stderr=PIPE)
    l = ""
    while l != 'Ready\n':
        if "command not found" in l:
            # Executable was not found; report and stop waiting
            self.log_error(session,
                           "Error while initializing EnjuObject: "
                           "{0}".format(l.strip()))
            break
        l = self.pipe.stderr.readline()
def process_document(self, session, doc):
    """Pass Document to executable, add results to document metadata.

    self.cmd may contain %INDOC% / %OUTDOC% placeholders; when a
    placeholder is absent the document is piped via stdin and/or the
    result read from stdout.  The parsed result is merged into
    doc.metadata[self.metadataType] and stamped with analysisDateTime.
    """
    cmd = self.cmd
    # True when the placeholder is absent, i.e. use the pipe instead
    stdIn = cmd.find('%INDOC%') == -1
    stdOut = cmd.find('%OUTDOC%') == -1
    if not stdIn:
        # Write the raw document to a temp file for the command to read
        if doc.mimeType or doc.filename:
            # guess our extn~n
            try:
                suff = mimetypes.guess_extension(doc.mimeType)
            except Exception:
                suff = ''
            if not suff:
                suff = mimetypes.guess_extension(doc.filename)
            if suff:
                (qq, infn) = tempfile.mkstemp(suff)
            else:
                (qq, infn) = tempfile.mkstemp()
        else:
            (qq, infn) = tempfile.mkstemp()
        os.close(qq)
        fh = open(infn, 'w')
        fh.write(doc.get_raw(session))
        fh.close()
        cmd = cmd.replace("%INDOC%", infn)
    if not stdOut:
        # Create a temp file for the command to write its output into
        if self.outMimeType:
            # guess our extn~n
            suff = mimetypes.guess_extension(self.outMimeType)
            (qq, outfn) = tempfile.mkstemp(suff)
        else:
            (qq, outfn) = tempfile.mkstemp()
        cmd = cmd.replace("%OUTDOC%", outfn)
        os.close(qq)
    if self.working:
        old = os.getcwd()
        os.chdir(self.working)
    else:
        old = ''
    if stdIn:
        pipe = subprocess.Popen(cmd, bufsize=0, shell=True,
                                stdin=subprocess.PIPE,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        pipe.stdin.write(doc.get_raw(session))
        pipe.stdin.close()
        result = pipe.stdout.read()
        pipe.stdout.close()
        pipe.stderr.close()
        del pipe
    else:
        # result will read stdout+err regardless
        result = getShellResult(cmd)
        os.remove(infn)
    if not stdOut:
        ofh = None
        if os.path.exists(outfn) and os.path.getsize(outfn) > 0:
            ofh = open(outfn)
        else:
            # command probably added something to the end -- annoying
            matches = glob.glob(outfn + "*")
            for m in matches:
                if os.path.getsize(m) > 0:
                    ofh = open(m)
                    break
        if ofh is None:
            # BUG FIX: previously an unbound-name error; fail explicitly
            os.remove(outfn)
            raise IOError("No output file produced by: %s" % cmd)
        result = ofh.read()
        ofh.close()
        os.remove(outfn)
    # strip input filename from result if present (this is a tempfile so
    # the name is useless)
    # BUG FIX: guard for stdin mode (infn unset) and escape the filename
    # before embedding it in a regular expression
    if not stdIn and result.startswith(infn):
        result = re.sub(r'^%s\s*[:-]?\s*' % re.escape(infn), '', result)
    if old:
        os.chdir(old)
    try:
        doc.metadata[self.metadataType].update(
            self._processResult(session, result))
    except (KeyError, AttributeError):
        # No existing metadata of this type yet
        doc.metadata[self.metadataType] = self._processResult(
            session, result)
    if 'analysisDateTime' not in doc.metadata[self.metadataType]:
        doc.metadata[self.metadataType][
            'analysisDateTime'] = time.strftime('%Y-%m-%dT%H:%M:%S%Z')
    return doc
def __init__(self, session, config, parent=None):
    """Constructor inherited by all configured Cheshire3 objects.

    The constructor for all Cheshire3 objects take the same arguments:
    session: A Session object
    topNode: The <config> or <subConfig> domNode for the configuration
    parent: The object that provides the scope for this object.

    Parses either an lxml element (has .attrib) or a DOM node, then
    wires up paths, settings, defaults, sub-configs, permissions,
    logging and integrity checksums.
    """
    self.docstring = ""
    self.parent = parent
    self.subConfigs = CaselessDictionary()
    self.paths = {}
    self.objects = CaselessDictionary()
    self.settings = {}
    self.defaults = {}
    self.permissionHandlers = {}
    self.unresolvedObjects = {}
    self.functionLogger = None
    self._objectRefs = []
    self._includeConfigStores = []
    self.logger = None
    self.checkSums = {}
    self.pathCheckSums = {}
    self.version = ""
    self.complexity = ""
    self.stability = ""
    self.initTime = time.time()
    pathObjects = {}
    # LXML branch
    if hasattr(config, 'attrib'):
        self.id = config.attrib.get('id', '')
        self.version = config.attrib.get('version', '')
        self.complexity = config.attrib.get('complexity', '')
        self.stability = config.attrib.get('stability', '')
        walker = config.iterchildren(tag=etree.Element)
        for e in walker:
            if e.tag in ['name', '{%s}name' % CONFIG_NS]:
                self.name = e.text
            elif e.tag in ['objectType', '{%s}objectType' % CONFIG_NS]:
                self.objectType = e.text
            elif e.tag in ['checkSums', '{%s}checkSums' % CONFIG_NS]:
                for e2 in e.iterchildren(tag=etree.Element):
                    # Store checksum on self, and hash code against it
                    pt = e2.attrib.get('pathType', '__code__')
                    ct = e2.attrib.get('type', 'md5')
                    if pt != '__code__':
                        try:
                            self.pathCheckSums[pt].append((ct, e2.text))
                        except KeyError:
                            self.pathCheckSums[pt] = [(ct, e2.text)]
                    else:
                        self.checkSums[ct] = e2.text
            elif e.tag in ['paths', '{%s}paths' % CONFIG_NS]:
                for e2 in e.iterchildren(tag=etree.Element):
                    try:
                        typ = e2.attrib['type']
                    except KeyError:
                        raise ConfigFileException("path must have type")
                    if e2.tag in ['path', '{%s}path' % CONFIG_NS]:
                        # Allow template strings in paths
                        # e.g. ${cheshire3Home}/foo/bar
                        pathTmpl = Template(e2.text)
                        sub = pathTmpl.safe_substitute
                        self.paths[typ] = sub(cheshire3Paths)
                    elif e2.tag in ['object', '{%s}object' % CONFIG_NS]:
                        try:
                            ref = e2.attrib['ref']
                        except KeyError:
                            msg = "object must have ref"
                            raise ConfigFileException(msg)
                        pathObjects[typ] = ref
            elif e.tag in ['subConfigs', '{%s}subConfigs' % CONFIG_NS]:
                # Recurse
                self._recurseLxmlSubConfigs(session, e)
            elif e.tag in ['options', '{%s}options' % CONFIG_NS]:
                for e2 in e.iterchildren(tag=etree.Element):
                    try:
                        typ = e2.attrib['type']
                    except KeyError:
                        msg = "option (setting/default) must have type"
                        raise ConfigFileException(msg)
                    if e2.tag in ['setting', '{%s}setting' % CONFIG_NS]:
                        value = self._verifySetting(typ, e2.text)
                        self.settings[typ] = value
                    elif e2.tag in ['default', '{%s}default' % CONFIG_NS]:
                        value = self._verifyDefault(typ, e2.text)
                        self.defaults[typ] = value
            elif e.tag in ['actions', '{%s}actions' % CONFIG_NS]:
                pass
            elif e.tag in ['docs', '{%s}docs' % CONFIG_NS]:
                self.docstring = e.text
            else:
                self._handleLxmlConfigNode(session, e)
        del walker
    else:
        # DOM branch
        if (config.hasAttributeNS(None, 'id')):
            self.id = config.getAttributeNS(None, 'id')
        for child in config.childNodes:
            if child.nodeType == elementType:
                if child.localName == "name":
                    self.name = getFirstData(child)
                elif (child.localName == "objectType"):
                    self.objectType = getFirstData(child)
                elif (child.localName == "paths"):
                    # Configure self with paths
                    for child2 in child.childNodes:
                        if child2.nodeType == elementType:
                            type = child2.getAttributeNS(None, 'type')
                            if child2.localName == "path":
                                value = getFirstData(child2)
                                # Allow template strings in paths
                                # e.g. ${cheshire3Home}/foo/bar
                                pathTmpl = Template(value)
                                sub = pathTmpl.safe_substitute
                                self.paths[type] = sub(cheshire3Paths)
                            elif child2.localName == "object":
                                value = child2.getAttributeNS(None, 'ref')
                                pathObjects[type] = value
                elif (child.localName == "subConfigs"):
                    # Pointers to dom nodes for config ids
                    self._recurseSubConfigs(session, child)
                elif (child.localName == "objects"):
                    for obj in child.childNodes:
                        if (
                            obj.nodeType == elementType and
                            obj.localName == "path"
                        ):
                            type = obj.getAttributeNS(None, 'type')
                            id = obj.getAttributeNS(None, 'ref')
                            self._objectRefs.append((id, type))
                elif (child.localName == "options"):
                    # See configInfo in ZeeRex
                    for child2 in child.childNodes:
                        if (child2.nodeType == elementType):
                            type = child2.getAttributeNS(None, 'type')
                            if (child2.localName == "setting"):
                                dc = getFirstData(child2)
                                if (dc):
                                    value = self._verifySetting(type, dc)
                                    self.settings[type] = value
                            elif (child2.localName == "default"):
                                dc = getFirstData(child2)
                                if (dc):
                                    value = self._verifyDefault(type, dc)
                                    self.defaults[type] = value
                elif (child.localName == "actions"):
                    # Permission rqmts
                    for child2 in child.childNodes:
                        if child2.nodeType == elementType:
                            p = PermissionHandler(child2, self)
                            self.permissionHandlers[p.actionIdentifier] = p
                elif (child.localName == "docs"):
                    # Add per configuration documentation to docs stack.
                    self.docstring = getFirstData(child)
                else:
                    self._handleConfigNode(session, child)
    if ('pythonPath' in self.paths):
        # NOTE(review): indexes [1] of the stored path value — confirm
        # the stored shape for pythonPath entries
        sys.path.append(self.paths['pythonPath'][1])
    # Allow any object to be set to debug
    # Functionality of this dependent on object
    self.debug = self.get_setting(session, "debug", 0)
    for p in self.permissionHandlers.keys():
        if p[0:5] == 'c3fn:':
            self.add_auth(p[5:])
    # Dynamically Instantiate objects. This is mindbending :}
    # Mindbending2: JIT building!
    if self.parent:
        self.parent.objects[self.id] = self
    for o in (self._objectRefs):
        # Instantiate (get_object caches the built object; the return
        # value itself is not needed here)
        self.get_object(session, o[0])
    # Add default Object types to paths
    for t in pathObjects.keys():
        self.unresolvedObjects[t] = pathObjects[t]
    # Built, maybe set function logging
    log = self.get_setting(session, 'log',
                           session.server.defaultFunctionLog)
    if log:
        fl = self.get_path(session, 'functionLogger')
        if fl != self:
            self.functionLogger = fl
            logList = log.strip().split()
            for l in logList:
                self.add_logging(session, l)
            try:
                del self.settings['log']
            except KeyError:
                # from default
                pass
    # Now checksum self
    if self.checkSums:
        code = inspect.getsource(self.__class__)
        for (ct, val) in self.checkSums.items():
            m = hashlib.new(ct)
            m.update(code)
            digest = m.hexdigest()
            if digest != val:
                raise IntegrityException(self.id + ": " + digest)
    if self.pathCheckSums:
        # Step through each referenced file and check
        for (pt, chk) in self.pathCheckSums.items():
            for (ct, val) in chk:
                m = hashlib.new(ct)
                # Read in file
                fn = self.get_path(session, pt)
                if not os.path.isabs(fn):
                    if pt == 'executable':
                        # Search
                        # BUG FIX: previously passed the literal string
                        # 'session' instead of the session object
                        dp = self.get_path(session, 'executablePath', '')
                        if not dp:
                            dp = getShellResult('which {0}'.format(fn))
                    else:
                        dp = self.get_path(session, 'defaultPath')
                    fn = os.path.join(dp, fn)
                fh = open(fn)
                data = fh.read()
                fh.close()
                m.update(data)
                digest = m.hexdigest()
                if digest != val:
                    msg = "%s/%s (%s): %s" % (self.id, pt, fn, digest)
                    raise IntegrityException(msg)
def process_document(self, session, doc):
    """Pass Document to executable, add results to document metadata.

    Duplicate of the variant above: %INDOC% / %OUTDOC% placeholders in
    self.cmd select temp-file vs pipe I/O; results are merged into
    doc.metadata[self.metadataType] with an analysisDateTime stamp.
    """
    cmd = self.cmd
    # True when the placeholder is absent, i.e. use the pipe instead
    stdIn = cmd.find('%INDOC%') == -1
    stdOut = cmd.find('%OUTDOC%') == -1
    if not stdIn:
        # Write the raw document to a temp file for the command to read
        if doc.mimeType or doc.filename:
            # guess our extn~n
            try:
                suff = mimetypes.guess_extension(doc.mimeType)
            except Exception:
                suff = ''
            if not suff:
                suff = mimetypes.guess_extension(doc.filename)
            if suff:
                (qq, infn) = tempfile.mkstemp(suff)
            else:
                (qq, infn) = tempfile.mkstemp()
        else:
            (qq, infn) = tempfile.mkstemp()
        os.close(qq)
        fh = open(infn, 'w')
        fh.write(doc.get_raw(session))
        fh.close()
        cmd = cmd.replace("%INDOC%", infn)
    if not stdOut:
        # Create a temp file for the command to write its output into
        if self.outMimeType:
            # guess our extn~n
            suff = mimetypes.guess_extension(self.outMimeType)
            (qq, outfn) = tempfile.mkstemp(suff)
        else:
            (qq, outfn) = tempfile.mkstemp()
        cmd = cmd.replace("%OUTDOC%", outfn)
        os.close(qq)
    if self.working:
        old = os.getcwd()
        os.chdir(self.working)
    else:
        old = ''
    if stdIn:
        pipe = subprocess.Popen(cmd, bufsize=0, shell=True,
                                stdin=subprocess.PIPE,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        pipe.stdin.write(doc.get_raw(session))
        pipe.stdin.close()
        result = pipe.stdout.read()
        pipe.stdout.close()
        pipe.stderr.close()
        del pipe
    else:
        # result will read stdout+err regardless
        result = getShellResult(cmd)
        os.remove(infn)
    if not stdOut:
        ofh = None
        if os.path.exists(outfn) and os.path.getsize(outfn) > 0:
            ofh = open(outfn)
        else:
            # command probably added something to the end -- annoying
            matches = glob.glob(outfn + "*")
            for m in matches:
                if os.path.getsize(m) > 0:
                    ofh = open(m)
                    break
        if ofh is None:
            # BUG FIX: previously an unbound-name error; fail explicitly
            os.remove(outfn)
            raise IOError("No output file produced by: %s" % cmd)
        result = ofh.read()
        ofh.close()
        os.remove(outfn)
    # strip input filename from result if present (this is a tempfile so
    # the name is useless)
    # BUG FIX: guard for stdin mode (infn unset) and escape the filename
    # before embedding it in a regular expression
    if not stdIn and result.startswith(infn):
        result = re.sub(r'^%s\s*[:-]?\s*' % re.escape(infn), '', result)
    if old:
        os.chdir(old)
    try:
        doc.metadata[self.metadataType].update(
            self._processResult(session, result))
    except (KeyError, AttributeError):
        # No existing metadata of this type yet
        doc.metadata[self.metadataType] = self._processResult(
            session, result)
    if 'analysisDateTime' not in doc.metadata[self.metadataType]:
        doc.metadata[self.metadataType][
            'analysisDateTime'] = time.strftime('%Y-%m-%dT%H:%M:%S%Z')
    return doc
def process_document(self, session, doc):
    """Run the external command over doc, return result as new Document.

    self.cmd may contain %INDOC% / %OUTDOC% placeholders; when absent,
    the document is piped via stdin and/or the result read from stdout.
    Returns a StringDocument carrying the command output, inheriting
    processHistory, parent and filename from the input document.

    Raises ExternalSystemException when the command produced no
    readable output file.
    """
    cmd = self.cmd
    stdIn = cmd.find('%INDOC%') == -1
    stdOut = cmd.find('%OUTDOC%') == -1
    if not stdIn:
        # Create temp file for incoming data
        if doc.mimeType or doc.filename:
            # Guess our extn~n
            try:
                suff = mimetypes.guess_extension(doc.mimeType)
            except Exception:
                suff = ''
            if not suff:
                suff = mimetypes.guess_extension(doc.filename)
            if not suff:
                (foofn, suff) = os.path.splitext(doc.filename)
            if suff:
                (qq, infn) = tempfile.mkstemp(suff)
            else:
                (qq, infn) = tempfile.mkstemp()
        else:
            (qq, infn) = tempfile.mkstemp()
        os.close(qq)
        fh = open(infn, 'w')
        fh.write(doc.get_raw(session))
        fh.close()
        cmd = cmd.replace("%INDOC%", infn)
    if not stdOut:
        # Create temp file to outgoing data
        if self.outMimeType:
            # Guess our extn~n
            suff = mimetypes.guess_extension(self.outMimeType)
            (qq, outfn) = tempfile.mkstemp(suff)
        else:
            (qq, outfn) = tempfile.mkstemp()
        cmd = cmd.replace("%OUTDOC%", outfn)
        os.close(qq)
    if self.working:
        old = os.getcwd()
        os.chdir(self.working)
    else:
        old = ''
    if stdIn:
        pipe = subprocess.Popen(cmd, bufsize=0, shell=True,
                                stdin=subprocess.PIPE,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        pipe.stdin.write(doc.get_raw(session))
        pipe.stdin.close()
        result = pipe.stdout.read()
        pipe.stdout.close()
        pipe.stderr.close()
        del pipe
    else:
        # Result will read stdout+err regardless
        result = getShellResult(cmd)
        os.remove(infn)
    if not stdOut:
        fh = None
        if os.path.exists(outfn) and os.path.getsize(outfn) > 0:
            fh = open(outfn)
        else:
            # Command probably appended something to the filename
            # Annoying! Have to glob for it
            matches = glob.glob(outfn + "*")
            # Or maybe ignored absolute path and put it in pwd...
            matches2 = glob.glob(os.path.split(outfn)[-1] + '*')
            for m in matches + matches2:
                if os.path.getsize(m) > 0:
                    fh = open(m)
                    break
        try:
            if fh is None:
                # BUG FIX: previously surfaced as a NameError swallowed
                # by a bare except; raise the intended exception directly
                msg = '{0}: {1}'.format(cmd, result)
                raise ExternalSystemException(msg)
            try:
                result = fh.read()
            except (IOError, OSError):
                msg = '{0}: {1}'.format(cmd, result)
                raise ExternalSystemException(msg)
            else:
                fh.close()
        finally:
            os.remove(outfn)
        try:
            # Clean up when data written elsewhere
            os.remove(fh.name)
        except OSError:
            pass
    if old:
        os.chdir(old)
    mt = self.outMimeType
    if not mt:
        mt = doc.mimeType
    return StringDocument(result, self.id, doc.processHistory,
                          mimeType=mt, parent=doc.parent,
                          filename=doc.filename)
def find_documents(self, session, cache=0):
    """Generate cluster documents from the sorted cluster data file.

    Sorts self.streamLocation with an external sort, then groups
    consecutive lines sharing the same null-separated key into one
    <cluster> document each.  cache == 0 yields StringDocuments as they
    are built; otherwise they are appended to self.documents.
    cache == 1 is unsupported (no file to record offsets into).
    """
    if cache == 1:
        # Can't store offsets as there's no file to offset to.
        raise NotImplementedError
    data = self.streamLocation
    sortx = self.factory.get_path(session, 'sortPath', None)
    if sortx is None:
        sortx = getShellResult('which sort')
    # FIX: local previously shadowed the `sorted` builtin
    sortedFn = data + "_SORT"
    os.spawnl(os.P_WAIT, sortx, sortx, data, '-o', sortedFn)
    # Now construct cluster documents.
    doc = ["<cluster>"]
    f = open(sortedFn)
    l = f.readline()
    # Line layout: term docid recstore occs (line, posn)* null-separated
    while l:
        docdata = {}
        ldata = l.split('\x00')
        key = ldata[0]
        if not key:
            # Data from records with no key
            l = f.readline()
            l = l[:-1]
            continue
        doc.append("<key>%s</key>\n" % (key))
        ldata = ldata[1:-1]
        # Remaining fields are (name, value) pairs
        for bit in range(len(ldata) // 2):
            d = docdata.get(ldata[bit * 2], [])
            d.append(ldata[bit * 2 + 1])
            docdata[ldata[bit * 2]] = d
        l = f.readline()
        l = l[:-1]
        ldata2 = l.split('\x00')
        key2 = ldata2[0]
        # Accumulate every subsequent line with the same key
        while key == key2:
            ldata2 = ldata2[1:-1]
            for bit in range(len(ldata2) // 2):
                d = docdata.get(ldata2[bit * 2], [])
                d.append(ldata2[bit * 2 + 1])
                docdata[ldata2[bit * 2]] = d
            l = f.readline()
            l = l[:-1]
            ldata2 = l.split('\x00')
            key2 = ldata2[0]
        for k in docdata.keys():
            doc.append("<%s>" % (k))
            for i in docdata[k]:
                doc.append("%s" % i)
            doc.append("</%s>" % (k))
        doc.append("</cluster>")
        sdoc = StringDocument(" ".join(doc))
        if cache == 0:
            yield sdoc
        else:
            self.documents.append(sdoc)
        doc = ["<cluster>"]
        l = f.readline()
        l = l[:-1]
    f.close()
def create_defaultConfig(identifier, args):
    """Create and return a generic database configuration.

    identifier := string
    args := argparse.Namespace
    """
    defaultPath = args.directory
    # Build the full <config> element tree with the CONF element maker
    config = CONF.config(
        {'id': identifier, 'type': 'database'},
        CONF.objectType("cheshire3.database.SimpleDatabase"),
        # <paths>
        CONF.paths(
            CONF.path({'type': "defaultPath"}, os.path.abspath(defaultPath)),
            # subsequent paths may be relative to defaultPath
            CONF.path({'type': "metadataPath"},
                      os.path.join('.cheshire3', 'stores', 'metadata.bdb')
                      ),
            CONF.object({'type': "recordStore",
                         'ref': "recordStore"}
                        ),
            CONF.object({'type': "protocolMap",
                         'ref': "cqlProtocolMap"}
                        ),
            CONF.path({'type': "indexStoreList"}, "indexStore"),
        ),
        CONF.subConfigs(
            # recordStore
            CONF.subConfig(
                {'type': "recordStore", 'id': "recordStore"},
                CONF.objectType("cheshire3.recordStore.BdbRecordStore"),
                CONF.paths(
                    CONF.path({'type': "defaultPath"},
                              os.path.join('.cheshire3', 'stores')
                              ),
                    CONF.path({'type': "databasePath"},
                              'recordStore.bdb'
                              ),
                    CONF.object({'type': "idNormalizer",
                                 'ref': "StringIntNormalizer"}
                                ),
                    CONF.object({'type': "inWorkflow",
                                 'ref': "XmlToLZ4Workflow"}
                                ),
                    CONF.object({'type': "outWorkflow",
                                 'ref': "LZ4ToLxmlWorkflow"}
                                ),
                ),
                CONF.options(
                    CONF.setting({'type': "digest"}, 'md5'),
                ),
            ),
            # indexStore
            CONF.subConfig(
                {'type': "indexStore", 'id': "indexStore"},
                CONF.objectType("cheshire3.indexStore.BdbIndexStore"),
                CONF.paths(
                    CONF.path({'type': "defaultPath"},
                              os.path.join('.cheshire3', 'indexes')
                              ),
                    CONF.path({'type': "tempPath"},
                              'temp'
                              ),
                    CONF.path({'type': "recordStoreHash"},
                              'recordStore'
                              ),
                )
            ),
            # protocolMap
            CONF.subConfig(
                {'type': "protocolMap", 'id': "cqlProtocolMap"},
                CONF.objectType("cheshire3.protocolMap.CQLProtocolMap"),
                CONF.paths(
                    CONF.path({'type': "zeerexPath"}, args.zeerexPath)
                ),
            ),
            # MagicRedirectPreParser
            # Over-ride default behavior to preParse generic file types
            # to METS so that it can be parsed and indexed as XML
            CONF.subConfig(
                {'type': "preParser", 'id': "MagicRedirectPreParser"},
                CONF.objectType(
                    "cheshire3.preParser.MagicRedirectPreParser"),
                CONF.hash(
                    CONF.object({'mimeType': "application/pdf",
                                 'ref': "PdfToMetsPreParserWorkflow"}
                                ),
                    CONF.object({'mimeType': "text/prs.fallenstein.rst",
                                 'ref': "ReSTToMetsPreParserWorkflow"}
                                ),
                    CONF.object({'mimeType': "text/plain",
                                 'ref': "TxtToMetsPreParserWorkflow"}
                                ),
                    CONF.object({'mimeType': "text/html",
                                 'ref': "HtmlToMetsPreParserWorkflow"}
                                ),
                    CONF.object({'mimeType': "*",
                                 'ref': "METSWrappingPreParser"}
                                ),
                ),
            ),
        ),
    )
    # Check sortPath and fix up if necessary
    # NOTE(review): relies on module-level `server` and `session`
    # objects — confirm they are initialized before this is called
    serverSortPath = server.get_path(session, 'sortPath')
    if not os.path.exists(serverSortPath):
        # Attempt to fix locally for default IndexStore
        sortPath = getShellResult('which sort')
        if 'which: no sort in' not in sortPath:
            # Found a sort executable - can add to configuration
            storePathsNode = config.xpath(
                '//c3:subConfig[@id="indexStore"]/c3:paths',
                namespaces={'c3': CONFIG_NS}
            )[0]
            storePathsNode.append(
                CONF.path({'type': "sortPath"}, sortPath)
            )
    # Add database docs if provided
    if args.title and args.description:
        config.insert(0,
                      CONF.docs("{0.title} - {0.description}".format(args)))
    elif args.title:
        config.insert(0, CONF.docs(args.title))
    elif args.description:
        config.insert(0, CONF.docs(args.description))
    return config
def process_document(self, session, doc):
    """Run the external command over doc, return result as new Document.

    Older duplicate of the variant above; same placeholder-driven
    temp-file / pipe handling, returning a StringDocument.

    Raises ExternalSystemException when the command produced no
    readable output file.
    """
    cmd = self.cmd
    stdIn = cmd.find('%INDOC%') == -1
    stdOut = cmd.find('%OUTDOC%') == -1
    if not stdIn:
        if doc.mimeType or doc.filename:
            # guess our extn~n
            try:
                suff = mimetypes.guess_extension(doc.mimeType)
            except Exception:
                suff = ''
            if not suff:
                suff = mimetypes.guess_extension(doc.filename)
            if not suff:
                (foofn, suff) = os.path.splitext(doc.filename)
            if suff:
                (qq, infn) = tempfile.mkstemp(suff)
            else:
                (qq, infn) = tempfile.mkstemp()
        else:
            (qq, infn) = tempfile.mkstemp()
        os.close(qq)
        fh = open(infn, 'w')
        fh.write(doc.get_raw(session))
        fh.close()
        cmd = cmd.replace("%INDOC%", infn)
    if not stdOut:
        if self.outMimeType:
            # guess our extn~n
            suff = mimetypes.guess_extension(self.outMimeType)
            (qq, outfn) = tempfile.mkstemp(suff)
        else:
            (qq, outfn) = tempfile.mkstemp()
        cmd = cmd.replace("%OUTDOC%", outfn)
        os.close(qq)
    if self.working:
        old = os.getcwd()
        os.chdir(self.working)
    else:
        old = ''
    if stdIn:
        pipe = subprocess.Popen(cmd, bufsize=0, shell=True,
                                stdin=subprocess.PIPE,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        pipe.stdin.write(doc.get_raw(session))
        pipe.stdin.close()
        result = pipe.stdout.read()
        pipe.stdout.close()
        pipe.stderr.close()
        del pipe
    else:
        # result will read stdout+err regardless
        result = getShellResult(cmd)
        os.remove(infn)
    if not stdOut:
        fh = None
        if os.path.exists(outfn) and os.path.getsize(outfn) > 0:
            fh = open(outfn)
        else:
            # command probably added something to the end -- annoying
            matches = glob.glob(outfn + "*")
            # or maybe ignored absolute path and put it in pwd...
            matches2 = glob.glob(os.path.split(outfn)[-1] + '*')
            for m in matches + matches2:
                if os.path.getsize(m) > 0:
                    fh = open(m)
                    break
        try:
            if fh is None:
                # BUG FIX: previously surfaced as a NameError swallowed
                # by a bare except; raise the intended exception directly
                raise ExternalSystemException(
                    'Error from command: {0} : {1}'.format(cmd, result))
            try:
                result = fh.read()
            except (IOError, OSError):
                raise ExternalSystemException(
                    'Error from command: {0} : {1}'.format(cmd, result))
            else:
                fh.close()
        finally:
            os.remove(outfn)
        try:
            # clean up when data was written somewhere other than outfn
            os.remove(fh.name)
        except OSError:
            pass
    if old:
        os.chdir(old)
    mt = self.outMimeType
    if not mt:
        mt = doc.mimeType
    return StringDocument(result, self.id, doc.processHistory,
                          mimeType=mt, parent=doc.parent,
                          filename=doc.filename)
def create_defaultConfig(identifier, args):
    """Create and return a generic database configuration.

    identifier := string
    args := argparse.Namespace
    """
    defaultPath = args.directory
    # Assemble each subConfig separately, then compose the full tree.
    recordStore = CONF.subConfig(
        {'type': "recordStore", 'id': "recordStore"},
        CONF.objectType("cheshire3.recordStore.BdbRecordStore"),
        CONF.paths(
            CONF.path({'type': "defaultPath"},
                      os.path.join('.cheshire3', 'stores')),
            CONF.path({'type': "databasePath"}, 'recordStore.bdb'),
            CONF.object({'type': "idNormalizer",
                         'ref': "StringIntNormalizer"}),
            CONF.object({'type': "inWorkflow",
                         'ref': "XmlToLZ4Workflow"}),
            CONF.object({'type': "outWorkflow",
                         'ref': "LZ4ToLxmlWorkflow"}),
        ),
        CONF.options(CONF.setting({'type': "digest"}, 'md5')),
    )
    indexStore = CONF.subConfig(
        {'type': "indexStore", 'id': "indexStore"},
        CONF.objectType("cheshire3.indexStore.BdbIndexStore"),
        CONF.paths(
            CONF.path({'type': "defaultPath"},
                      os.path.join('.cheshire3', 'indexes')),
            CONF.path({'type': "tempPath"}, 'temp'),
            CONF.path({'type': "recordStoreHash"}, 'recordStore'),
        ),
    )
    protocolMap = CONF.subConfig(
        {'type': "protocolMap", 'id': "cqlProtocolMap"},
        CONF.objectType("cheshire3.protocolMap.CQLProtocolMap"),
        CONF.paths(CONF.path({'type': "zeerexPath"}, args.zeerexPath)),
    )
    # Over-ride default behavior to preParse generic file types to METS
    # so that they can be parsed and indexed as XML
    magicPreParser = CONF.subConfig(
        {'type': "preParser", 'id': "MagicRedirectPreParser"},
        CONF.objectType("cheshire3.preParser.MagicRedirectPreParser"),
        CONF.hash(
            CONF.object({'mimeType': "application/pdf",
                         'ref': "PdfToMetsPreParserWorkflow"}),
            CONF.object({'mimeType': "text/prs.fallenstein.rst",
                         'ref': "ReSTToMetsPreParserWorkflow"}),
            CONF.object({'mimeType': "text/plain",
                         'ref': "TxtToMetsPreParserWorkflow"}),
            CONF.object({'mimeType': "text/html",
                         'ref': "HtmlToMetsPreParserWorkflow"}),
            CONF.object({'mimeType': "*",
                         'ref': "METSWrappingPreParser"}),
        ),
    )
    config = CONF.config(
        {'id': identifier, 'type': 'database'},
        CONF.objectType("cheshire3.database.SimpleDatabase"),
        # <paths>
        CONF.paths(
            CONF.path({'type': "defaultPath"},
                      os.path.abspath(defaultPath)),
            # subsequent paths may be relative to defaultPath
            CONF.path({'type': "metadataPath"},
                      os.path.join('.cheshire3', 'stores',
                                   'metadata.bdb')),
            CONF.object({'type': "recordStore", 'ref': "recordStore"}),
            CONF.object({'type': "protocolMap",
                         'ref': "cqlProtocolMap"}),
            CONF.path({'type': "indexStoreList"}, "indexStore"),
        ),
        CONF.subConfigs(recordStore, indexStore, protocolMap,
                        magicPreParser),
    )
    # Check sortPath and fix up if necessary
    serverSortPath = server.get_path(session, 'sortPath')
    if not os.path.exists(serverSortPath):
        # Attempt to fix locally for default IndexStore
        whichSort = getShellResult('which sort')
        if 'which: no sort in' not in whichSort:
            # Found a sort executable - can add to configuration
            pathsNode = config.xpath(
                '//c3:subConfig[@id="indexStore"]/c3:paths',
                namespaces={'c3': CONFIG_NS})[0]
            pathsNode.append(CONF.path({'type': "sortPath"}, whichSort))
    # Add database docs if provided
    if args.title and args.description:
        config.insert(
            0, CONF.docs("{0.title} - {0.description}".format(args)))
    elif args.title:
        config.insert(0, CONF.docs(args.title))
    elif args.description:
        config.insert(0, CONF.docs(args.description))
    return config
def find_documents(self, session, cache=0):
    """Generate cluster documents from a sorted dump of cluster data."""
    if cache == 1:
        # No underlying file to record offsets into.
        raise NotImplementedError
    source = self.streamLocation
    sortCmd = self.factory.get_path(session, 'sortPath', None)
    if sortCmd == None:
        sortCmd = getShellResult('which sort')
    sortedPath = source + "_SORT"
    os.spawnl(os.P_WAIT, sortCmd, sortCmd, source, '-o', sortedPath)
    # Build one <cluster> document per distinct leading key.
    parts = ["<cluster>"]
    inf = open(sortedPath)
    line = inf.readline()
    # Line layout: key then (name, value) pairs, null-separated
    while line:
        fields = line.split('\x00')
        key = fields[0]
        if not key:
            # Skip data from records that produced no key
            line = inf.readline()[:-1]
            continue
        collected = {}
        parts.append("<key>%s</key>\n" % (key))
        vals = fields[1:-1]
        for i in range(0, len(vals) - 1, 2):
            collected.setdefault(vals[i], []).append(vals[i + 1])
        line = inf.readline()[:-1]
        fields2 = line.split('\x00')
        # Fold in every following line that shares the same key
        while key == fields2[0]:
            vals2 = fields2[1:-1]
            for i in range(0, len(vals2) - 1, 2):
                collected.setdefault(vals2[i], []).append(vals2[i + 1])
            line = inf.readline()[:-1]
            fields2 = line.split('\x00')
        for name in collected.keys():
            parts.append("<%s>" % (name))
            for v in collected[name]:
                parts.append("%s" % v)
            parts.append("</%s>" % (name))
        parts.append("</cluster>")
        clusterDoc = StringDocument(" ".join(parts))
        if cache == 0:
            yield clusterDoc
        else:
            self.documents.append(clusterDoc)
        parts = ["<cluster>"]
        line = inf.readline()[:-1]
    inf.close()