def evalRelationTypes(fname, fGoldStd, methodLabel, method):
    """ evaluates the given ontology and writes the results into a file
        @param[in] fname        file name of the ontology to evaluate
        @param[in] fGoldStd     file name of the gold standard ontology
        @param[in] methodLabel  label of the method used in the evaluation
        @param[in] method       method used in the evaluator
    """
    goldStd = _readOntology(fGoldStd)
    ontology = _readOntology(fname)

    goldStdConcepts = set(map(str, extractRelationSet(goldStd)))
    ontologyConcepts = set(map(str, extractRelationSet(ontology)))
    log.info("Comparing the relation set %s to the gold standard %s."
             % (ontologyConcepts, goldStdConcepts))

    res = [1]
    for scoringMethod in (EqualRel, EqualGroup, SimilarGroup):
        __cache__ = DiskCache(".diskCache-%s-%s"
                              % (scoringMethod.__name__, os.path.basename(fGoldStd)))
        c = ConceptScoring(ontologyConcepts, goldStdConcepts, scoringMethod, '|')
        key = "%s, %s |" % (ontologyConcepts, goldStdConcepts)
        score = __cache__.fetchObjectId(key, c.score)
        res.append(score)

        # compute precision, recall and the f-measure
        p = float(score) / len(ontologyConcepts)
        r = float(score) / len(goldStdConcepts)
        if p == 0. and r == 0.:
            res.append(0.)
        else:
            res.append(metrics.fMeasure(p, r))

    return res
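# A minimal sketch of the F-measure used above, assuming metrics.fMeasure
# computes the standard balanced F_beta score (for beta=1, the harmonic mean
# of precision and recall); only the function name and its call site come
# from the snippet above, the body is an assumption.
def fMeasure(p, r, beta=1.0):
    """ returns the F_beta measure for precision p and recall r """
    if p == 0. and r == 0.:
        return 0.
    beta2 = beta * beta
    return (1. + beta2) * p * r / (beta2 * p + r)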
def __init__(self, dataSource, cache=True):
    """ @param[in] dataSource  a data source implementing the TagInfoService interface """
    assert isinstance(dataSource, TagInfoService)
    self.dataSource = dataSource
    if cache:
        diskCache = DiskCache("./.coherence-tagcount-cache", 2)
        self.getTagCount = lambda tt: diskCache.fetchObjectId(
            self.dataSource.__class__.__name__ + str(tt),
            self.dataSource.getTagInfo, tt)
    else:
        self.getTagCount = self.dataSource.getTagInfo
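# Hypothetical usage sketch of the memoized getTagCount wrapper above:
# 'DummyTagInfo' and the enclosing class name 'TagCoherence' are assumed
# names; only TagInfoService, getTagInfo and the caching pattern are taken
# from the snippet.
class DummyTagInfo(TagInfoService):
    def getTagInfo(self, tags):
        return len(tags) * 100          # dummy count instead of a web query

coherence = TagCoherence(DummyTagInfo(), cache=True)   # class name assumed
assert coherence.getTagCount(('linux', 'ubuntu')) == 200   # second call hits the disk cache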
def getPostHashfile(self, cmd):
    ''' returns an identifier representing the object which is compatible
        with the identifiers returned by the eWRT.util.cache.* classes. '''
    # required to produce the same hash as DiskCache's fetch method
    args = (tuple(cmd[1:]), ())
    return self._get_fname(DiskCache.getObjectId(args))
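# The cache tests in this section show that DiskCache.fetch(str, 2) stores
# its result under the key ((2,), ()), i.e. (positional args, keyword args).
# getPostHashfile rebuilds exactly that layout from a command list, so both
# code paths resolve to the same cache file.  The command list below is a
# made-up example.
cmd = ['post', 'http://example.com', 'payload']
key = (tuple(cmd[1:]), ())        # -> (('http://example.com', 'payload'), ())
object_id = DiskCache.getObjectId(key)   # same id that fetch() would compute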
class WebDocumentTerm(TermReference):
    """ @class WebDocumentTerm
        Similarity metric based on the similarity of the documents
        retrieved with a web search.
    """
    yahoo = Yahoo()
    __cache__ = DiskCache(".diskCache-WebDocumentTerm-conceptCache",
                          cache_nesting_level=2)

    @staticmethod
    def _getConceptWebDocuments(concept):
        """ returns web documents describing the given concept
            @param[in] concept  concept used to describe the text
        """
        searchTerms = (concept.name, ) + tuple(concept.context_terms)[:CONTEXT_TERM_COUNT]
        log.debug("Searching for %s" % str(searchTerms))
        yq = Yahoo.getSearchResults(
            WebDocumentTerm.yahoo.query(searchTerms,
                                        count=WEB_DOCUMENT_COUNT,
                                        queryParams={'view': 'keyterms',
                                                     'abstract': 'long',
                                                     'type': 'html,text'}))
        p = Pool(WEB_DOCUMENT_COUNT)
        text = "\n".join(p.map(p_getWebDocumentText, yq))
        return cleanup(text)

    @staticmethod
    def _getConceptWebDocumentsVector(concept):
        return VectorSpaceModel(
            WebDocumentTerm._getConceptWebDocuments(concept).split())

    @staticmethod
    @DiskCached(".diskCache-WebDocumentTerm-or")
    def _or(c1, c2):
        """ Compares two concepts and returns their similarity score
            @param[in] c1  the first OntologyConcept
            @param[in] c2  the second OntologyConcept
            @returns the similarity between c1 and c2
        """
        c1Text = WebDocumentTerm.__cache__.fetchObjectId(
            c1, WebDocumentTerm._getConceptWebDocumentsVector, c1)
        c2Text = WebDocumentTerm.__cache__.fetchObjectId(
            c2, WebDocumentTerm._getConceptWebDocumentsVector, c2)

        # zero similarity for concepts without any web matches
        if len(c1Text.v) == 0 or len(c2Text.v) == 0:
            if len(c1Text.v) == 0:
                log.warn("No web pages found for '%s'" % c1)
            if len(c2Text.v) == 0:
                log.warn("No web pages found for '%s'" % c2)
            return 0.
        return c1Text * c2Text

    def __or__(self, o):
        return self._or(self.e, o.e)
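# The score 'c1Text * c2Text' above multiplies two VectorSpaceModel
# instances.  A minimal sketch, assuming '*' implements the usual cosine
# similarity over term-frequency vectors (only the attribute 'v' is taken
# from the snippet; the rest is an assumption, not eWRT's implementation).
import math
from collections import Counter

class VectorSpaceModel(object):
    def __init__(self, terms):
        self.v = Counter(terms)            # term -> frequency

    def __mul__(self, other):
        dot = sum(f * other.v.get(t, 0) for t, f in self.v.items())
        norm = math.sqrt(sum(f * f for f in self.v.values())) \
             * math.sqrt(sum(f * f for f in other.v.values()))
        return dot / norm if norm else 0.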
def evalOntology(fname, fGoldStd, methodLabel, method):
    """ evaluates the given ontology and writes the results into a file
        @param[in] fname        file name of the ontology to evaluate
        @param[in] fGoldStd     file name of the gold standard ontology
        @param[in] methodLabel  label of the method used in the evaluation
        @param[in] method       method used in the evaluator
    """
    goldStd = _readOntology(fGoldStd)
    ontology = _readOntology(fname)

    goldStdConcepts = OntologyConcept.sequenceToOntologyConceptList(extractConceptSet(goldStd))
    ontologyConcepts = OntologyConcept.sequenceToOntologyConceptList(extractConceptSet(ontology))
    log.info("Comparing the ontology concepts %s to the gold standard %s."
             % (ontologyConcepts, goldStdConcepts))

    res = [conceptTermCount(ontology)]
    for scoringMethod in (EqualTerm, StringEditTerm, PhoneticTerm, WordNetTerm,
                          WikipediaTerm, WebDocumentTerm, GoogleDistanceTerm,
                          OntologyTerm, ):
        __cache__ = DiskCache(".diskCache-%s-%s"
                              % (scoringMethod.__name__, os.path.basename(fGoldStd)))

        # methods using neighbor concepts
        if scoringMethod in (WebDocumentTerm, ):
            goldNeighborConcepts = OntologyConcept.statementsToDirectNeighborOntologyConceptList(
                extractSPO(goldStd))
            ontoNeighborConcepts = OntologyConcept.statementsToDirectNeighborOntologyConceptList(
                extractSPO(ontology))
            c = ConceptScoring(ontoNeighborConcepts, goldNeighborConcepts,
                               scoringMethod, '|', poolSize=1)
            key = "%s, %s |" % (ontoNeighborConcepts, goldNeighborConcepts)
        # methods using all concepts
        else:
            ps = 1 if scoringMethod == OntologyTerm else 4
            c = ConceptScoring(ontologyConcepts, goldStdConcepts,
                               scoringMethod, '|', poolSize=ps)
            key = "%s, %s |" % (ontologyConcepts, goldStdConcepts)

        score = __cache__.fetchObjectId(key, c.score)
        print scoringMethod, score
        res.append(score)

        # compute precision, recall and the f-measure
        p = float(score) / len(ontologyConcepts)
        r = float(score) / len(goldStdConcepts)
        if p == 0. and r == 0.:
            res.append(0.)
        else:
            res.append(metrics.fMeasure(p, r))

    return res
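# Both evaluation functions memoize the expensive scoring runs through
# DiskCache.fetchObjectId(key, fn, *args).  A simplified sketch of that
# contract (only the method signature is taken from the calls above; the
# on-disk layout and hashing below are assumptions, not eWRT's code):
import hashlib, os, pickle

class MiniDiskCache(object):
    def __init__(self, cache_dir):
        self.cache_dir = cache_dir
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)

    def fetchObjectId(self, key, fn, *args):
        # return the cached value for 'key' if present; otherwise compute
        # fn(*args), store it under a hash of the key, and return it
        path = os.path.join(self.cache_dir,
                            hashlib.sha1(repr(key).encode("utf8")).hexdigest())
        if os.path.exists(path):
            with open(path, "rb") as f:
                return pickle.load(f)
        result = fn(*args)
        with open(path, "wb") as f:
            pickle.dump(result, f)
        return result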
class Calais:
    submitter = USER_AGENT % "Calais"
    allow_distro = "false"
    allow_search = "false"
    api_key = ""
    cache = None

    def __init__(self, submitter, api_key=OPENCALAIS_KEY, allow_distro="false",
                 allow_search="false", cache_dir=OPENCALAIS_CACHE_DIR):
        """ Creates a new handler for communicating with OpenCalais.
            The parameter 'submitter' must contain a string identifying
            your application.
            'api_key' must contain a string with your OpenCalais API key
            (get it here: http://developer.opencalais.com/apps/register).
            The optional parameter 'allow_distro', if set to 'true', gives
            OpenCalais permission to distribute the metadata extracted from
            your submissions. The default value for 'allow_distro' is 'false'.
            The optional parameter 'allow_search', if set to 'true', tells
            OpenCalais that future searches can be performed on the extracted
            metadata. The default value for 'allow_search' is 'false'.
        """
        assert api_key
        self.submitter = submitter
        self.allow_distro = allow_distro
        self.allow_search = allow_search
        self.api_key = api_key
        if cache_dir:
            self.cache = DiskCache(cache_dir, cache_nesting_level=2,
                                   cache_file_suffix=".xml")

    @staticmethod
    def random_id():
        """ Creates a random 10-character ID for your submission. """
        from string import letters, digits
        chars = letters + digits
        return "".join(choice(chars) for _ in xrange(10))

    @staticmethod
    def content_id(text):
        """ Creates a SHA1 hash of the text of your submission. """
        try:
            import hashlib
            h = hashlib.sha1()
        except ImportError:
            import sha
            h = sha.new()
        h.update(text)
        return h.hexdigest()

    def analyze(self, text, content_type="text/txt"):
        """ Submits 'text' to OpenCalais for analysis and memorizes the
            extracted metadata. Set the content type to 'text/html' if you
            are submitting HTML data.
        """
        externalID = self.content_id(text)
        paramsXML = PARAMS_XML % (content_type, self.allow_distro,
                                  self.allow_search, externalID, self.submitter)
        param = urlencode({'licenseID': self.api_key,
                           'content': text,
                           'paramsXML': paramsXML})

        # do not fetch the data again if a file exists in the cache
        get_calais_data = lambda x: Retrieve(Calais.__name__).open(OPENCALAIS_URL, x).read()
        if self.cache is None:
            xml_data = self.unpack(get_calais_data(param))
        else:
            xml_data = self.unpack(self.cache.fetch(get_calais_data, param))
        return self.parse(xml_data)

    @staticmethod
    def unpack(calais_data):
        """ extracts Calais' xml response from the data sent by the
            Calais web service """
        dom = minidom.parseString(calais_data)
        return """<?xml version="1.0" encoding="utf-8"?>\n""" \
            + dom.getElementsByTagName("string")[0].firstChild.data

    @staticmethod
    def cleanup_xml(xml_data):
        """ removes comments from xml data streams provided by OpenCalais
            @param[in] xml_data
            @returns the xml data without any comments
        """
        # repeat the substitution, since removing a comment may expose a
        # new '<!-- ... -->' sequence
        while '<!--' in xml_data:
            xml_data = re.sub('<!--[\s\S]*?-->', '', xml_data)
            if not re.search('<!--', xml_data):
                break
        return xml_data

    @staticmethod
    def parse(xml_data):
        """ parses OpenCalais' xml output and returns its dictionary
            representation """
        things = []
        xml_data = Calais.cleanup_xml(xml_data)
        dom = minidom.parseString(xml_data.encode("utf8"))
        for document in dom.getElementsByTagName("CalaisSimpleOutputFormat"):
            for annotations in document.childNodes:
                if not annotations.hasChildNodes():
                    continue
                if annotations.nodeName == 'Topics':
                    annotations = annotations.firstChild

                nodeName = annotations.nodeName
                nodeAttr = dict(annotations.attributes.items())
                nodeAttr.update({'data': annotations.firstChild.data})
                things.append({nodeName: nodeAttr})
        return things
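# A short usage sketch for the Calais class above; the submitter string,
# API key and input text are placeholders, everything else comes from the
# class itself.
calais = Calais(submitter="my-app/1.0", api_key="YOUR_OPENCALAIS_KEY")
annotations = calais.analyze("Vienna is the capital of Austria.")
for thing in annotations:
    print thing          # one dict per annotation, e.g. {'Topics': {...}}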
class SkipTestDiskCached(TestCached):

    @staticmethod
    @DiskCached(get_cache_dir(1))
    def add(a=1, b=2):
        return a + b

    @staticmethod
    @DiskCached(get_cache_dir(2))
    def sub(a, b):
        return a - b

    def setUp(self):
        self.diskCache = DiskCache(get_cache_dir(4))

    def tearDown(self):
        ''' remove the cache directories '''
        for cacheDirNo in range(10):
            if exists(get_cache_dir(cacheDirNo)):
                rmtree(get_cache_dir(cacheDirNo))

    def testObjectKeyGeneration(self):
        ''' ensures that the diskcache object's location does not change '''
        CACHE_DIR = get_cache_dir(3)
        d = DiskCache(CACHE_DIR)
        getCacheLocation = lambda x: join(CACHE_DIR, Cache.getObjectId(x))

        d.fetchObjectId(1, str, 1)
        assert exists(getCacheLocation(1))

        d.fetch(str, 2)
        assert exists(getCacheLocation(((2,), ())))

    def testContains(self):
        ''' verifies that 'key' in cache works '''
        # diskcache
        assert self.diskCache.fetchObjectId(1, str, 1) == "1"
        assert 1 in self.diskCache
        assert 2 not in self.diskCache

        # diskcached
        assert self.add(12, 14) == 26
        assert self.add.getKey(12, 14) in self.add
        assert 9 not in self.add

    def testDelItem(self):
        ''' verifies that delitem works '''
        # diskcache
        assert self.diskCache.fetch(str, 2) == "2"
        key = self.diskCache.getKey(2)
        assert key in self.diskCache
        del self.diskCache[key]
        assert key not in self.diskCache

        # diskcached
        assert self.add(12, 13) == 25
        key = self.add.getKey(12, 13)
        assert key == ((12, 13), ())
        assert key in self.add
        del self.add[key]
        assert key not in self.add

    def testDirectCall(self):
        ''' tests directly calling the cache object using __call__ '''
        CACHE_DIR = get_cache_dir(4)
        cached_str = DiskCache(CACHE_DIR, fn=str)
        assert cached_str(7) == "7"
        assert cached_str.getKey(7) in cached_str

    def testIterableCache(self):
        ''' tests the iterable cache '''
        CACHE_DIR = get_cache_dir(5)
        i = IterableCache(CACHE_DIR)

        getTestIterator = lambda x: range(x)
        for iteratorSize in (4, 5, 6):
            cachedIterator = i.fetch(getTestIterator, iteratorSize)
            for x, y in zip(cachedIterator, getTestIterator(iteratorSize)):
                assert x == y

    @pytest.mark.slow
    def testThreadSafety(self):
        ''' tests whether everything is thread safe '''
        for a in range(1000):
            c = DiskCache(get_cache_dir(6))
            p = Pool(12)
            p.map(f, 60 * [c])
            p.map(g, 60 * [c])
            p.close()
            p.join()
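# testThreadSafety above maps the module-level helpers f and g over the pool
# workers; they are not shown in this section.  A hypothetical sketch of what
# they might look like (names taken from the test, bodies are assumptions):
# each worker hammers the shared DiskCache with overlapping keys, so a race
# in the cache's read/write path would surface as a corrupted value.
def f(cache):
    for i in range(10):
        assert cache.fetch(str, i) == str(i)

def g(cache):
    for i in range(10):
        assert cache.fetchObjectId(i, str, i) == str(i)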
def __init__(self, e):
    """ @param[in] e  the OntologyConcept wrapped by this metric; a separate
        DiskCache is created for every TermReference metric in self.METRICS
    """
    TermReference.__init__(self, e)
    self.metrics = dict([(m(e), DiskCache(".diskCache-single-%s" % m.__name__))
                         for m in self.METRICS])
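# Hypothetical sketch of how the per-metric caches above might be consumed;
# the combining method is not part of this section, and the key format, the
# _or signature on every metric, and taking the maximum over all metrics are
# all assumptions.
def __or__(self, o):
    key = "%s | %s" % (self.e, o.e)
    scores = [cache.fetchObjectId(key, m._or, self.e, o.e)
              for m, cache in self.metrics.items()]
    return max(scores)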