    def testObjectKeyGeneration(self):
        """Check that DiskCache stores entries at the expected location.

        The expected path of an entry is the cache directory joined with
        ``Cache.getObjectId(key)``.  The test fails if either fetch call
        leaves no file at that path.
        """
        cache_root = get_cache_dir(3)
        disk_cache = DiskCache(cache_root)

        def expected_path(key):
            # Path at which an entry stored under *key* is expected.
            return join(cache_root, Cache.getObjectId(key))

        # Fetch with an explicitly supplied key (1).
        disk_cache.fetchObjectId(1, str, 1)
        assert exists(expected_path(1))

        # Fetch without an explicit key: the entry is expected under the
        # key ((2, ), ()) -- presumably the (args, kwargs) of the call.
        disk_cache.fetch(str, 2)
        derived_key = ((2, ), ())
        assert exists(expected_path(derived_key))
class Calais(object):
    """Handler for submitting text to OpenCalais and parsing its reply."""

    # Class-level defaults; __init__ overrides all of them per instance.
    submitter = USER_AGENT % "Calais"
    allow_distro = "false"
    allow_search = "false"
    api_key = ""

    def __init__(self, submitter, api_key, allow_distro="false",
                 allow_search="false", cache_dir=None):
        """
        Creates a new handler for communicating with OpenCalais.

        The parameter 'submitter' must contain a string, identifying your
        application.
        'api_key' must contain a string with your OpenCalais API key (get it
        here: http://developer.opencalais.com/apps/register).
        The optional parameter 'allow_distro', if set to 'true' gives
        OpenCalais permission to distribute the metadata extracted from your
        submissions.  The default value for 'allow_distro' is 'false'.
        The optional parameter 'allow_search', if set to 'true' tells
        OpenCalais that future searches can be performed on the extracted
        metadata.  The default value for 'allow_search' is 'false'.
        The optional parameter 'cache_dir', if set, enables an on-disk cache
        of the responses in that directory.

        NOTE(review): the parameter list was cut off in the reviewed copy and
        has been reconstructed from this description and from the names the
        body uses -- confirm the order against the callers.
        """
        assert api_key
        self.submitter = submitter
        # NOTE(review): both flags are pinned to "false" regardless of the
        # arguments, exactly as before.  Honouring the arguments would let
        # callers grant OpenCalais distribution/search rights, so that change
        # is left as an explicit decision.
        self.allow_distro = "false"
        self.allow_search = "false"
        self.api_key = api_key
        # analyze() tests `self.cache is None`, so the attribute must exist
        # even when no cache directory is configured.
        # NOTE(review): the original DiskCache(...) call had further
        # arguments that are not visible here -- restore them if needed.
        self.cache = DiskCache(cache_dir) if cache_dir else None

    def random_id(self):
        """
        Creates a random 10-character ID for your submission.

        The ID consists of ASCII letters and digits.
        """
        import string

        chars = string.ascii_letters + string.digits
        return "".join(choice(chars) for _ in range(10))

    @staticmethod
    def content_id(text):
        """
        Creates a SHA1 hash of the text of your submission.

        @param[in] text the submitted text (unicode text is encoded as UTF-8
                        before hashing)
        @returns the hexadecimal SHA1 digest of the text
        """
        try:
            import hashlib
            h = hashlib.sha1()
        except ImportError:
            # interpreters without hashlib: use the legacy sha module
            import sha
            h = sha.new()

        if isinstance(text, type(u"")):
            # hash objects consume bytes, not unicode text
            text = text.encode("utf-8")
        h.update(text)
        return h.hexdigest()

    def analyze(self, text, content_type="text/txt"):
        """ Submits 'text' to OpenCalais for analysis and returns the
            extracted metadata.
            Set the content-type to 'text/html' if you are submitting HTML
            data.

            If a cache has been configured, the response is fetched through
            it, so that identical requests are not sent again.

            @param[in] text         the text to analyze
            @param[in] content_type content type of the submitted text
            @returns the list produced by parse()
        """
        externalID = self.content_id(text)
        # NOTE(review): the last value of this tuple was cut off in the
        # reviewed copy; self.submitter is assumed -- confirm that PARAMS_XML
        # expects five values with the submitter last.
        paramsXML = PARAMS_XML % (content_type, self.allow_distro,
                                  self.allow_search, externalID,
                                  self.submitter)
        param = urlencode({
            'licenseID': self.api_key,
            'content': text,
            'paramsXML': paramsXML,
        })

        def get_calais_data(post_data):
            # sends the encoded request and returns the raw response body
            return Retrieve(Calais.__name__).open(
                OPENCALAIS_URL, post_data).read()

        # do not fetch the data again, if a file exists in the cache
        if self.cache is None:
            calais_data = get_calais_data(param)
        else:
            calais_data = self.cache.fetch(get_calais_data, param)

        return self.parse(self.unpack(calais_data))

    @staticmethod
    def unpack(calais_data):
        """ extracts calais' xml response from the data sent by calais

            @param[in] calais_data the xml envelope received from calais
            @returns the text of the first <string> element, prefixed with
                     an xml declaration
        """
        dom = minidom.parseString(calais_data)
        payload = dom.getElementsByTagName("string")[0].firstChild.data
        return """<?xml version="1.0" encoding="utf-8"?>\n""" + payload

    @staticmethod
    def cleanup_xml(xml_data):
        """ removes comments from xml-data-streams provided by opencalais
            @param[in] xml_data
            @returns the xml data without any comments

            An unterminated '<!--' is left untouched.
        """
        comment = re.compile(r'<!--[\s\S]*?-->')

        # Repeat until a pass removes nothing: deleting one comment may join
        # the surrounding text into a new one, and stopping on "no change"
        # guarantees termination even when a '<!--' is never closed.
        while True:
            cleaned = comment.sub('', xml_data)
            if cleaned == xml_data:
                return cleaned
            xml_data = cleaned

    @staticmethod
    def parse(xml_data):
        """ parses opencalais' xml output and returns its dictionary
            representation

            @param[in] xml_data the xml document as returned by unpack()
            @returns a list with one single-key dictionary per annotation:
                     {node name: attributes of the node + its text as 'data'}
        """
        things = []

        xml_data = Calais.cleanup_xml(xml_data)
        dom = minidom.parseString(xml_data.encode("utf8"))

        for document in dom.getElementsByTagName("CalaisSimpleOutputFormat"):
            for annotations in document.childNodes:
                # nodes without children carry no annotation data
                if not annotations.hasChildNodes():
                    continue

                # a 'Topics' node is a wrapper; its first child is recorded
                if annotations.nodeName == 'Topics':
                    annotations = annotations.firstChild

                nodeName = annotations.nodeName
                nodeAttr = dict(annotations.attributes.items())
                nodeAttr['data'] = annotations.firstChild.data

                things.append({nodeName: nodeAttr})

        return things
# File: __init__.py  Project: k3njiy/ewrt
