Python Calais示例，calais.Calais Python示例

示例#1

0

显示文件

def get_calais_subjects(text, uid):
    """Return the names of the Calais social tags found in ``text``.

    The API key and the importance threshold are read from the
    ``ITagHelperSettingsSchema`` registry record. A tag is kept only when
    its ``importance`` is strictly greater than the configured relevance.

    Args:
        text: Content sent to Calais for analysis.
        uid: Forwarded to Calais as ``external_id``.

    Returns:
        A list of tag names. The list is empty when no API key is
        configured, when the Calais request fails, or when the response
        carries no ``socialTag`` attribute.
    """
    registry = getUtility(IRegistry)
    settings = registry.forInterface(ITagHelperSettingsSchema)
    api_key = settings.calais_api_key
    relevance = settings.calais_relevance
    if not api_key:
        return []

    calais = Calais(api_key)
    try:
        result = calais.analyze(text, external_id=uid)
    except Exception:
        # Best effort: a failed request yields no suggestions. Catching
        # Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return []

    # Only social tags are harvested; entities and relations are ignored.
    return [
        tag['name']
        for tag in getattr(result, 'socialTag', [])
        if float(tag['importance']) > relevance
    ]

示例#2

0

显示文件

文件： usermine.py 项目： HitherTech/usermine

def populate_database_with_semantic_data_from_comments(calais_api_key, db_cursor, debug):
    """Analyze every unprocessed comment with Calais and store the results.

    Rows of ``comments`` with ``updated = 0`` are analyzed one at a time.
    Entity names are inserted into ``entities`` and topic category names
    into ``topics``. Each comment is then flagged ``updated = 1`` whether
    or not its analysis succeeded, so a failing comment is not retried.

    Args:
        calais_api_key: API key used to build the Calais client.
        db_cursor: DB-API cursor using ``?`` parameter placeholders.
        debug: When true, print failure details and a ``.`` per comment.
    """
    calais = Calais(calais_api_key, submitter='usermine')

    db_cursor.execute('SELECT id, comment FROM comments WHERE updated = 0')

    for comment_data in db_cursor.fetchall():
        # ``comment_id`` rather than ``id`` so the builtin is not shadowed.
        comment_id = comment_data[0]
        comment = comment_data[1]

        try:
            result = calais.analyze(comment)

            for entity in getattr(result, 'entities', []):
                db_cursor.execute(
                    'INSERT INTO entities (id, entity) VALUES (NULL, ?)',
                    [entity['name']])

            for topic in getattr(result, 'topics', []):
                db_cursor.execute(
                    'INSERT INTO topics (id, topic) VALUES (NULL, ?)',
                    [topic['categoryName']])
        except Exception:
            # Deliberately best effort, but no longer a bare ``except`` so
            # KeyboardInterrupt / SystemExit can stop the run.
            if debug:
                print(sys.exc_info())

        db_cursor.execute('UPDATE comments SET updated=1 WHERE id = ?', [comment_id])

        if debug:
            print('.')

示例#3

0

显示文件

文件： processors.py 项目： carriercomm/spyda

    def process_calais(content, key):
        """Return ``{"people": [...]}`` with the names of Person entities.

        ``content`` is analyzed with a Calais client built from ``key``.
        A response without an ``entities`` attribute yields an empty list.
        """
        response = Calais(key).analyze(content)

        people = []
        for entity in getattr(response, "entities", []):
            if entity["_type"] == "Person":
                people.append(entity["name"])

        return {"people": people}

示例#4

0

显示文件

文件： utilities.py 项目： jean/collective.taghelper

def get_calais_subjects(text, uid):
    """Suggest subjects for ``text`` from its Calais social tags.

    Settings (API key and minimum relevance) come from the registry record
    for ``ITagHelperSettingsSchema``.

    Args:
        text: Content to analyze.
        uid: Identifier passed to Calais as ``external_id``.

    Returns:
        Names of the social tags whose ``importance`` exceeds the
        configured relevance. An empty list is returned when there is no
        API key, the request raises, or no ``socialTag`` data comes back.
    """
    registry = getUtility(IRegistry)
    settings = registry.forInterface(ITagHelperSettingsSchema)
    api_key = settings.calais_api_key
    relevance = settings.calais_relevance
    subjects = []
    if not api_key:
        return subjects

    calais = Calais(api_key)
    try:
        result = calais.analyze(text, external_id=uid)
    except Exception:
        # A failed lookup means "no suggestions". ``except Exception``
        # replaces the bare ``except`` so interpreter-exit signals
        # (KeyboardInterrupt, SystemExit) are not swallowed.
        return []

    # Entities and relations are intentionally not used as subjects.
    for tag in getattr(result, "socialTag", []):
        if float(tag["importance"]) > relevance:
            subjects.append(tag["name"])
    return subjects

示例#5

0

显示文件

文件： Stage_2_OpenCalais.py 项目： metaswirl/powernet

def getOpenCalaisResultFromURL(url):
    """Analyze ``url`` with Calais using a randomly chosen API key.

    Args:
        url: Address handed to ``Calais.analyze_url``.

    Returns:
        The Calais result object, or ``None`` when the request raises.
    """
    # Pick from however many keys are configured instead of assuming
    # exactly seven (the old ``randint(0, 6)``).
    API_KEY = openCalaisKeys[randint(0, len(openCalaisKeys) - 1)]
    calais = Calais(API_KEY, submitter="python-calais demo")
    try:
        return calais.analyze_url(url)
    except Exception:
        # Best effort; Exception (not bare except) keeps Ctrl-C working.
        return None

示例#6

0

显示文件

文件： views.py 项目： rainzoo/jigsaw

def analyze(request):
    """Return the Calais entities for the POSTed ``content`` as JSON.

    The response body is the JSON-encoded entity data when Calais returns
    a truthy result. It is the JSON-encoded empty string when the result
    is falsy or a ``KeyError`` is raised while handling the request.
    """
    API_KEY = 'kgmykdr862hdfhkzkuchnxkc'
    calais = Calais(API_KEY, 'python')
    try:
        result = calais.analyze(request.POST['content'])
        payload = result.get_json_entities() if result else ""
        return HttpResponse(json.dumps(payload),
                            mimetype="application/json")
    except KeyError:
        return HttpResponse(json.dumps(""),
                            mimetype="application/json")

示例#7

0

显示文件

文件： calaisify.py 项目： adragomir/wurbe22

def calaisify_main(options, args):
    """Print Calais reports for the URLs found in stored entry titles.

    For every entry from ``get_non_calaised_entries()``, each short URL
    detected in the entry's ``title`` is expanded and analyzed, and the
    summary, topics and entities of the response are printed.
    ``options`` and ``args`` are accepted but not used.
    """
    store = persistence.MongoPersistence()
    calais = Calais(config.calais["api_key"], submitter="python-calais demo")
    for entry in store.get_non_calaised_entries():
        short_links = short_url.has(entry["title"])
        if not len(short_links):
            # Nothing to analyze for this entry.
            continue
        for link in short_links:
            response = calais.analyze_url(short_url.find_real(link))
            response.print_summary()
            response.print_topics()
            response.print_entities()

示例#8

0

显示文件

文件： views.py 项目： hampelm/detroit-council

def contract(request, contract):
    """Render ``contract.html`` with the blocks that mention ``contract``.

    Blocks are fetched sorted by ascending ``date``. Each block gains an
    ``entities`` key holding the Calais entities of its ``string`` field.
    The template context has ``items`` (the blocks) and ``contract``.
    """
    collection = connection()
    cursor = collection.find({'contracts': contract},
                             sort=[('date', ASCENDING)])

    calais = Calais(settings.CALAIS, submitter="hello world")

    annotated = []
    for block in cursor:
        block['entities'] = calais.analyze(block['string']).entities
        annotated.append(block)

    context = {'items': annotated, 'contract': contract}
    return render_to_response('contract.html', context)

示例#9

0

显示文件

文件： models.py 项目： markng/newscredit_api

def analyze(model, text=None, backend='calais'):
    """Run Calais over a model's text and attach the recognised entities.

    Args:
        model: Object providing ``analysis_text()`` and ``add_entity()``.
        text: Optional text to analyze instead of ``model.analysis_text()``.
        backend: Accepted but not used by this function.

    Returns:
        ``(result, records)`` on success, where ``records`` are the saved
        Person / Place / Organisation rows. ``None`` when the text is
        shorter than 100 characters. ``False`` when an ``AttributeError``
        occurs while processing the entities.
    """
    from calais import Calais
    from django.conf import settings

    calais = Calais(settings.CALAIS_API_KEY, submitter='newscredit')
    _text = text if text else model.analysis_text()

    # we cannot analyse if our total content is under 100 characters
    # after HTML cleaning. We leave OpenCalais to do this as they have
    # advanced heuristics to do it. If our text to analyse is less than
    # 100 characters, we skip the analysis.
    if len(_text) < 100:
        return

    result = calais.analyze(_text)
    records = []

    try:
        for entity in result.entities:
            entity_type = entity['_type']
            if entity_type == 'Person':
                _model = Person
            elif entity_type in ('City', 'Country', 'Continent',
                                 'Facility', 'Region'):
                _model = Place
            elif entity_type in ('Organization', 'Company'):
                _model = Organisation
            else:
                # Entity types with no matching model are ignored.
                continue

            # Reuse the row with the same Calais reference URI if present.
            try:
                _record = _model.objects.get(uri=entity['__reference'])
            except _model.DoesNotExist:
                _record = _model()
            _record.from_calais(entity)
            _record.save()
            model.add_entity(_record)
            records.append(_record)
    except AttributeError:
        # this happens if Calais throws an error. To ensure we continue
        # processing other records pass this error and return False
        return False
    return result, records

示例#10

0

显示文件

文件： models.py 项目： bcampbell/newscredit_api

def analyze(model, text=None, backend='calais'):
    """Save a ``Person`` row for every Person entity Calais finds.

    ``text`` is analyzed when given, otherwise ``model.analysis_text()``.
    Persons are looked up by their Calais reference URI and created when
    missing. ``backend`` is accepted but not used.

    NOTE(review): the collected ``people`` list is not returned in the
    code visible here, so the function returns ``None``. Confirm that
    nothing was cut from the end of this function.
    """
    from calais import Calais
    from django.conf import settings
    calais = Calais(settings.CALAIS_API_KEY, submitter='newscredit')
    result = calais.analyze(text if text else model.analysis_text())
    people = []
    for entity in result.entities:
        if entity['_type'] != 'Person':
            continue
        try:
            person = Person.objects.get(uri=entity['__reference'])
        except Person.DoesNotExist:
            person = Person()
        person.from_calais(entity)
        person.save()
        people.append(person)

示例#11

0

显示文件

文件： views.py 项目： dakoller/askasapquestion

def calais_test(request):
    """Render ``calais_test.html`` with Calais data for a requested URL.

    The URL comes from the ``content_url`` request parameter; an empty
    value falls back to ``http://n-tv.de``. The context carries the URL
    (``text``), a name -> type mapping of the entities (``calais_data``,
    pre-seeded with a fixed ``name`` entry) and the detected ``language``.
    """
    content_url = request.REQUEST['content_url']
    if content_url == '':
        content_url = 'http://n-tv.de'

    client = Calais('pc5v39x8sq3mh4mv9zm2ppre', submitter="ask-a-sap-question")
    result = client.analyze_url(content_url)
    result.print_summary()

    c_data = {"name": "Barack"}
    c_data.update(
        (entity['name'], entity['_type']) for entity in result.entities)

    context = {
        'text': content_url,
        'calais_data': c_data,
        'language': result.doc['meta']['language'],
    }
    return render_to_response('calais_test.html', context)

示例#12

0

显示文件

    def __init__(self):
        """Open the database and Calais connections used by the instance.

        Sets ``db_name`` and a ``connections`` dict with keys ``"db"``
        and ``"calais"``.
        """
        self.db_name = "pythia_db"

        db_connection = connect(self.db_name)
        calais_client = Calais("av536xwvy4mgmcbw9cancqmd",
                               submitter="pythia-application")
        self.connections = {"db": db_connection, "calais": calais_client}

示例#13

0

显示文件

文件： enrich.py 项目： yokkay/comp-journalism

def main():
    """Read text from stdin, link its Calais entities, print HTML.

    Angle brackets are stripped from the input before analysis. Each line
    of the linked text is printed wrapped in a ``<p>`` element. If
    ``sys.stdin`` is unavailable, usage is shown and the process exits
    with status 1.
    """
    if not sys.stdin:
        usage()
        sys.exit(1)

    # ``text`` rather than ``input`` so the builtin is not shadowed.
    text = sys.stdin.read()  # read from stdin
    text = text.replace("<", "").replace(">", "")

    api_key = config.read_config('calais', 'api_key')
    calais = Calais(api_key, submitter="fridakahlo")
    entities = calais.analyze(text).entities

    linked_text = add_links(entities, text)
    for line in linked_text.splitlines():
        # Close the paragraph: the original emitted "<p> ... <p>".
        print("<p> %s </p>" % line)

示例#14

0

显示文件

文件： resume_extract.py 项目： quifor/resume_extractor

def entities(env, start_response):
    """Extracts entities from resume utilizing the OpenCalais webservice.

    Reads a fixed resume file, analyzes it with Calais and answers with
    an XML document listing ``(name, type, relevance)`` per entity.

    Args:
        env: Request environment (not read here).
        start_response: Callable used to start the response.

    Returns:
        A one-element list with the XML body. When the Calais call fails
        the element is an ``<error>`` document instead.

    Raises:
        restlite.Status: If the resume file cannot be read.
    """
    start_response('200 OK', [('Content-Type', 'text/xml')])
    API_KEY = "kqyhhfppufvmvxspkstwjxw5"
    calais = Calais(API_KEY, submitter="resume_analysis")
    try:
        with open('Darin_Plutchok_Resume_Taxonomist.txt') as f:
            text = f.read()
    except (IOError, OSError):
        # Call form of raise: valid on Python 2 and 3.
        raise restlite.Status('400 Error Reading File')
    try:
        results = calais.analyze(text)
    except Exception as e:
        # Same shape as the success path: a list with one body string.
        return ["<error>%s</error>" % e]

    # A response without entities yields an empty list, not a crash.
    entities_tuples = [(entity['name'], entity['_type'], entity['relevance'])
                       for entity in getattr(results, 'entities', [])]
    doc = create_xml({'entities': entities_tuples})

    return [str(doc.toxml())]

示例#15

0

显示文件

文件： resume_extract.py 项目： dplutcho/resume_extractor

def entities(env, start_response):
    """Extracts entities from resume utilizing the OpenCalais webservice.

    The resume is read from a fixed file, sent to Calais, and the
    entities are returned as XML, one ``(name, type, relevance)`` each.

    Args:
        env: Request environment (unused in this handler).
        start_response: Callable used to start the response.

    Returns:
        A list holding one string: the XML document, or an ``<error>``
        document when the Calais request raises.

    Raises:
        restlite.Status: When the resume file cannot be opened or read.
    """
    start_response('200 OK', [('Content-Type', 'text/xml')])
    API_KEY = "kqyhhfppufvmvxspkstwjxw5"
    calais = Calais(API_KEY, submitter="resume_analysis")

    try:
        with open('Darin_Plutchok_Resume_Taxonomist.txt') as f:
            text = f.read()
    except (IOError, OSError):
        # Only I/O failures are a "400"; the bare except hid everything.
        raise restlite.Status('400 Error Reading File')

    try:
        results = calais.analyze(text)
    except Exception as e:
        # Return a list here too, matching the success path below.
        return ["<error>%s</error>" % e]

    # Tolerate a response that has no ``entities`` attribute.
    found = getattr(results, 'entities', [])
    entities_tuples = [
        (entity['name'], entity['_type'], entity['relevance'])
        for entity in found
    ]
    doc = create_xml({'entities': entities_tuples})

    return [str(doc.toxml())]

示例#16

0

显示文件

文件： coref.py 项目： tonyhart759456/citizen-quotes

def _get_people(text):
    '''
    Extract people and the context of their mentions from story text.

    The text is analyzed by Calais. For every entity of type ``Person``
    the result maps the person's name (``commonname`` when present,
    otherwise ``name``) to a list of ``(exact, suffix, prefix)`` tuples,
    one per instance of that entity: the mention itself, the text that
    follows it and the text that precedes it.

    Returns ``False`` when the Calais response has no ``entities``.

    Calais API errors are not handled here and propagate to the caller.
    '''
    client = Calais(API_KEY, submitter="tbc-coref-test")
    annotations = client.analyze(text)

    # Nothing recognised at all: signal that with False.
    if not hasattr(annotations, 'entities'):
        return False

    coref = {}
    for entity in annotations.entities:
        # Companies, places, etc. are of no interest here.
        if entity['_type'] != 'Person':
            continue
        mentions = [
            (inst.get('exact'), inst.get('suffix', ''), inst.get('prefix', ''))
            for inst in entity['instances']
        ]
        name = entity.get("commonname", entity.get('name', None))
        coref[name] = mentions
    return coref

示例#17

0

显示文件

文件： opencalais.py 项目： thenewsroom/syndication

    def __init__(self, content_object, content_fields=None):
        """Set up the Calais client and the content-field mapping.

        Args:
            content_object: Object handed to the parent initializer.
            content_fields: Optional mapping used as
                ``calais_content_fields``. When omitted, the mapping is
                built from ``calais_content_fields`` on the content
                object's class.

        Raises:
            OpenCalaisTagFetchError: If no ``content_fields`` are given
                and the content object's class does not define
                ``calais_content_fields``.
        """
        super(OpenCalais, self).__init__(content_object)
        self.calais = Calais(settings.CALAIS_API_KEY,
                             settings.CALAIS_SUBMITTER)

        if content_fields:
            self.calais_content_fields = content_fields
        else:
            try:
                self.calais_content_fields = dict(
                    self.content_object.__class__.calais_content_fields)
            except (AttributeError, FieldDoesNotExist) as e:
                # A missing class attribute raises AttributeError, so it
                # must be caught for the message below to be reachable.
                raise OpenCalaisTagFetchError(
                    'You need to define calais_content_fields: %s' % e)

示例#18

0

显示文件

文件： coref.py 项目： afrigeo/citizen-quotes

def _get_people(text):
    '''
    Run story text through Calais and collect coreferences for people.

    Returns a dict keyed by each Person entity's ``commonname`` (falling
    back to ``name``). Each value is a list with one tuple per instance
    of that person in the text, ordered ``(exact, suffix, prefix)``: the
    matched text, the text after it and the text before it.

    Returns ``False`` if Calais reports no ``entities`` at all.

    No Calais API errors are caught here; they reach the caller.
    '''
    def _context(instance):
        # (mention, following text, preceding text)
        return (instance.get('exact'),
                instance.get('suffix', ''),
                instance.get('prefix', ''))

    annotations = Calais(API_KEY, submitter="tbc-coref-test").analyze(text)

    if not hasattr(annotations, 'entities'):
        return False

    coref = {}
    people = (e for e in annotations.entities if e['_type'] == 'Person')
    for person in people:
        key = person.get("commonname", person.get('name', None))
        coref[key] = [_context(inst) for inst in person['instances']]
    return coref

示例#19

0

显示文件

文件： process.py 项目： robhemsley/Context-Controller

class classifier():
    """Tag text with Calais entities and attach a Qwiki URL and an image.

    NOTE(review): ``_get_google_image`` and ``_process_qwiki_results`` are
    called below but are not defined in the code visible here.
    """
    API_KEY = 'xyby6x47ycxj56bkkb83s9he'

    def __init__(self):
        self.calais = Calais(self.API_KEY, submitter='SocialTV Demo')

    def process(self, ip, text):
        """Return one dict per usable entity found in ``text``.

        Entities whose lower-cased name is in ``dirtyWords.DIRTY`` and
        entities with no Qwiki result are skipped. Each dict has the keys
        ``text``, ``tag``, ``qwiki`` and ``img``.

        Args:
            ip: Passed through to ``_get_google_image``.
            text: Text to analyze.
        """
        result = self.calais.analyze(text)

        # Only entities are used; social tags are deliberately ignored.
        all_tags = list(getattr(result, 'entities', []))
        if not all_tags:
            return []

        output = []
        for tag in all_tags:
            if tag['name'].lower() not in dirtyWords.DIRTY:
                qwiki = self._get_qwiki_url(tag['name'])
                if len(qwiki) > 0:
                    img = self._get_google_image(ip, tag['name'])
                    print("Tag: " + tag['name'])
                    output.append({'text': text, 'tag': tag['name'], 'qwiki': qwiki, 'img': img})
        return output

    def _get_qwiki_url(self, text):
        """Return a Qwiki URL for ``text``, or ``[]`` when none is found.

        The direct embed URL is probed first. If that request fails with
        ``urllib2.URLError`` the Qwiki search API is queried instead, and
        a non-empty result is handed to ``_process_qwiki_results``.
        """
        embed_url = ("http://www.qwiki.com/embed/"
                     + urllib2.quote(text.replace(' ', '_'))
                     + "?autoplay=true")
        try:
            # The body is irrelevant; only reachability matters. Close the
            # response so the connection is not leaked.
            urllib2.urlopen(embed_url).close()
            return embed_url
        except urllib2.URLError:
            response = urllib2.urlopen("http://embed-api.qwiki.com/api/v1/search.json?count=1&q="+urllib2.quote(text))
            try:
                html_eval = json.loads(response.read())
            finally:
                response.close()
            if len(html_eval) > 0:
                return self._process_qwiki_results(text, html_eval)
            return []

示例#20

0

显示文件

文件： calais_api.py 项目： bluemoon/Godel

class calaisApi:
    """Thin wrapper around a Calais client for per-sentence lookups."""

    def __init__(self):
        self.calais = Calais(KEY, submitter="GodelTest")

    @persistent_memoize
    def calais_run(self, sentence):
        """Return the Calais entities of ``sentence``.

        Returns a list of entities, ``False`` when there are none, or
        ``None`` when the analysis raises ``ValueError``.
        """
        try:
            result = self.calais.analyze(sentence)
        except ValueError:
            return

        found = []
        if hasattr(result, "entities") and len(result.entities) > 0:
            found.extend(result.entities)

        return found if len(found) > 0 else False

示例#21

0

显示文件

文件： calais_feature_generator.py 项目： dssg/machine_learning_legislation

def extract_entities(text, retries=5):
    """Return the Calais entity types detected in ``text``.

    Args:
        text: Text to analyze. It also seeds the random generator, so the
            same text selects the same API key on every call.
        retries: Maximum number of attempts against Calais.

    Returns:
        A list with the ``_type`` of every entity in the response, or an
        empty list when the response has no entities or every attempt
        failed.
    """
    import time
    ner_dir = os.path.realpath(
        os.path.abspath(
            os.path.join(
                os.path.split(inspect.getfile(inspect.currentframe()))[0],
                "../../ner")))
    # Register the bundled ``ner`` directory once; the original prepended
    # it on every call, growing ``sys.path`` without bound.
    if ner_dir not in sys.path:
        sys.path.insert(0, ner_dir)
    from calais import Calais
    random.seed(text)
    API_KEYS = [
        "wdbkpbpsksskkbm2rpqfm4xa", "mt5qu3e4jdrd6jpc9r9ecama",
        "k9fb7rfh7hpbfp238whuggrr", "55rared7un2pnjr23kjtctes",
        "ccw5tvhv5sewvnnnpkfa9ydn", "ne7yxpax4ebug4qz3p4jguej",
        "nsuasahckne72keu8qu6zjd3", "bvuy6mqmr7z7x8jw5f4zzpkr"
    ]
    # Build only the client that is used. ``randint`` is kept so the key
    # chosen for a given text is unchanged.
    key = API_KEYS[random.randint(0, len(API_KEYS) - 1)]
    calais = Calais(key, submitter="python-calais-demo")
    for _ in range(retries):
        try:
            result = calais.analyze(text)
            # Collect per attempt so a failure halfway through cannot
            # leave partial results that a retry would duplicate.
            return [
                calais_entity['_type']
                for calais_entity in getattr(result, 'entities', [])
            ]
        except Exception:
            logging.exception("failed while calling calais")
            time.sleep(1)
    logging.error("failed with all tries to call calais")
    return []

示例#22

0

显示文件

文件： assn3.py 项目： fagan2888/LinkedData_Entities

### Ariana Giorgi
### 10/31/2014
### Computational Journalism Assignment #3 - Open Calais
### https://code.google.com/p/python-calais/

from calais import Calais
import collections

# Set the Calais API key and create the client.
API_KEY = "g8gnzpdz52gkwyduv75zecem"
calais = Calais(API_KEY, submitter="python-calais demo")

# Demo text; it is replaced by the file contents read just below.
input_text = "George Bush was the President of the United States of America until 2009.  Barack Obama is the new President of the United States now."

# The ``with`` block closes the file; the stray ``f.closed`` expression
# that followed it had no effect and was removed.
with open('stdin_1.txt', 'r') as f:
    input_text = f.read()

result = calais.analyze(input_text)
#result.print_entities()

# Dictionary that will contain the linked data.
link_list = {}
# Count of detected references (collected for the assignment).
detected_count = 0

#loop through each entity and assign a link
for i in range(len(result.entities)):
    if 'resolutions' in result.entities[i]:
        #if Calais has assigned an RDF value, use that as the link

示例#23

0

显示文件

文件： __init__.py 项目： lfborjas/Magritte

def web_extract_terms(text, raw_query='', service='tagthe'):
    """
        Given a text, extract keyword terms with the selected web_service.
        Args:
            text: raw text from where to extract terms
            raw_query: a query which may contextualize the extraction. It is
                sent as a separate parameter to yahoo and appended to the
                text for every other service.
            service: which web service to use (a key of WEB_SERVICES)
        Returns:
            a sequence of query terms; ``["", ]`` when opencalais/tagthe find
            nothing, ``''`` when the service answers with an empty body
        Raises:
            ValueError: if ``service`` is not a key of WEB_SERVICES
            WebServiceException: if the request fails or the service
                reports an error
    """
    service = service.lower().strip()

    if service not in WEB_SERVICES:
        raise ValueError('%s is an invalid web service, possible choices are %s' % (service, repr(WEB_SERVICES.keys())))

    #1. Build the query:
    query = {}
    apikey = settings.WEB_SERVICES_KEYS.get(service, '')

    if service == 'wordsfinder':
        query = {
            'apikey': apikey,
            'context': text + raw_query,
        }
    elif service == 'alchemy':
        query = {
            'apikey': apikey,
            'text': text + raw_query,
            'outputMode': 'json',
        }
    elif service == 'yahoo':
        query = {
            'appid': apikey,
            'context': text,
            'output': 'json',
        }
        if raw_query:
            query.update({'query': raw_query})
    elif service == 'tagthe':
        query = {
            'text': text + raw_query,
            'view': 'json',
        }
    elif service == 'opencalais':
        # opencalais goes through the python-calais client, not a raw
        # HTTP request, so this branch always returns or raises:
        #http://www.opencalais.com/applications/python-calais-python-interface-opencalais-api
        s = Calais(apikey)
        try:
            res = s.analyze(text + raw_query)
        except Exception:
            # Exception rather than a bare except, so KeyboardInterrupt
            # and SystemExit are not turned into a service error.
            raise WebServiceException(service, 'error in request')
        if hasattr(res, 'topics') or hasattr(res, 'entities'):
            retval = [t['categoryName'] for t in res.topics] if hasattr(res, 'topics') else []
            retval += [t['name'] for t in res.entities] if hasattr(res, 'entities') else []
            return retval
        else:
            return ["", ]

    #2. Try to call the service:
    resp = None
    try:
        #HACK: tagthe has issues with POST requests, so try and do a GET
        #max length for a GET request is 2048:
        #http://stackoverflow.com/questions/1344616/max-length-of-query-string-in-an-ajax-get-request
        if service == 'tagthe':
            resp_url = urlopen(WEB_SERVICES[service] + '?%s' % urlencode(query))
        else:
            resp_url = urlopen(WEB_SERVICES[service], urlencode(query))
        resp = resp_url.read()
    except Exception as e:
        #TODO: retry in timeouts and stuff
        raise WebServiceException(service, 'Error in request to service : %s' % e)

    #3. Process the response:
    if resp:
        result = ''
        if service == 'alchemy':
            data = json.loads(resp)
            if data['status'] == 'ERROR':
                raise WebServiceException(service, 'call returned error status')
            result = [re.sub(r'-[^ ] ', '', e['text']) for e in data['keywords']]

        elif service == 'yahoo':
            data = json.loads(resp)
            result = data['ResultSet']['Result']

        elif service == 'wordsfinder':
            parsed_response = parseString(resp)
            errors = parsed_response.getElementsByTagName('error')
            if errors:
                # getElementsByTagName returns a node list: read the text
                # of the first <error> element. The original called
                # ``.firstChild.datad`` on the list itself, which raised
                # AttributeError instead of reporting the error.
                raise WebServiceException(service, 'error code %s' % errors[0].firstChild.data)
            else:
                result = [node.firstChild.data for node in parsed_response.getElementsByTagName('keyword')]
        elif service == 'tagthe':
            data = json.loads(resp)
            if 'memes' in data and 'dimensions' in data['memes'][0] and 'topic' in data['memes'][0]['dimensions']:
                result = data['memes'][0]['dimensions']['topic']
            else:
                result = ['', ]

        return [unescape(w) for w in result]

    # TODO: maybe find a way to call 'em all and have a super-set of kws?
    else:
        return ''

示例#24

0

显示文件

文件： bills_ner.py 项目： dssg/machine_learning_legislation

            os.path.join(
                os.path.split(inspect.getfile(inspect.currentframe()))[0],
                ".."))))
from calais import Calais
import codecs
import random
import psycopg2
from util import path_tools

# Command-line usage template; %s is a placeholder (presumably for the
# script name -- confirm where USAGE is formatted).
USAGE = "python %s <bill-version-file> <bill|report>"
# Calais API keys; one client per key is created below.
API_KEYS = [
    "wdbkpbpsksskkbm2rpqfm4xa", "mt5qu3e4jdrd6jpc9r9ecama",
    "k9fb7rfh7hpbfp238whuggrr", "55rared7un2pnjr23kjtctes"
]
# NOTE(review): presumably the maximum text size sent to Calais in one
# request -- confirm against where this constant is used.
MAX_TEXT_LENGTH = 100000
# One Calais client per API key, created at import time.
calaises = [Calais(key, submitter="python-calais-demo") for key in API_KEYS]

from util import configuration
CONN_STRING = configuration.get_connection_string()


class Entity:
    """Text span with ``text``, ``name``, ``type``, ``offset``, ``length``."""

    def __str__(self):
        """Format as ``text | name | type | start:end``."""
        labels = (self.text, self.name, self.type)
        span = (self.offset, self.offset + self.length)
        return "%s | %s | %s | %d:%d" % (labels + span)


def read_file(path):
    with codecs.open(path, 'r', 'utf8') as f:
        content = f.read()

示例#25

0

显示文件

文件： simple-ex.py 项目： jannae/rwet

from calais import Calais

# Calais API key and client used by this demo script.
API_KEY = "s5mba8qn5qb4vjmc663qxn8m"
calais = Calais(API_KEY, submitter="jannae")

# Analyze a fixed sample sentence.
result = calais.analyze("George Bush was the President of the United States of America until 2009.  Barack Obama is the new President of the United States now.")

# Print the summary of the analysis.
result.print_summary()

示例#26

0

显示文件

文件： main.py 项目： shujianbu/Open_Calais_NER

for filename in file:
    fout = open(("results/" + filename + ".html"), "w")
    fout.write('<html>')
    fout.write('\r\n')
    fout.write('<head><title>' + filename + '</title></head>')
    fout.write('\r\n')
    fout.write('<body>')

    with open(("articles/" + filename + ".txt"), "r") as myfile:
        sys.stdin = myfile
        content = ""
        for line in sys.stdin:
            content += line

        API_KEY = "f7vhuv2kt4fxufuvv6eznwpe"
        calais = Calais(API_KEY, submitter="python-calais newsparser")
        result = calais.analyze(content)

        print "Summary of the Calais Analysis"
        result.print_summary()

        print "Entity of the Calais Analysis"
        result.print_entities()

        i = 0
        temp = []
        entityList = []
        html = []
        for entity in result.entities:
            if result.entities[i]["_type"] in [
                    "City", "Company", "Country", "Movie", "Organization",

示例#27

0

显示文件

#!/usr/bin/env python

import sys
"""
Initialize requirements for OpenCalais
"""

from calais import Calais

# Module-level Calais client shared by body2entities().
CALAIS_API_KEY = 'ed42bg3ku3g3k98kv9kee78s'
calais = Calais(CALAIS_API_KEY, submitter="pagea1 tester")


def body2entities(body):
    """
    Run the Open Calais named entity recognizer over an article.

    Returns a 4-tuple of name lists ``(names, companies, orgs, terms)``
    holding the entities of type Person, Company, Organization and
    IndustryTerm respectively. Entities of any other type are ignored.
    """
    buckets = {
        "Person": [],
        "Company": [],
        "Organization": [],
        "IndustryTerm": [],
    }
    for entity in calais.analyze(body).entities:
        bucket = buckets.get(entity["_type"])
        if bucket is not None:
            bucket.append(entity["name"])
    return (buckets["Person"], buckets["Company"],
            buckets["Organization"], buckets["IndustryTerm"])

示例#28

0

显示文件

文件： history-analyze.py 项目： savannahniles/HistorySnooper

import re
import sys
from calais import Calais #https://code.google.com/p/python-calais/

# Calais client used for the whole script.
CALAIS_API_KEY = "an5duh4ktc5twbfysaakjhxs"
calais = Calais(CALAIS_API_KEY, submitter="historySnooper test")

# Read the whole history export. The ``with`` block closes the file even
# if reading fails.
with open('history-sample.csv') as f:
    lines = f.readlines()

class Topic:
    """Plain record pairing a topic ``name`` with a ``value``."""

    def __init__(self, name, value):
        # Stores both arguments unchanged; nothing else happens here.
        self.name, self.value = name, value

class Entity:
    """Plain record pairing an entity ``name`` with a ``value``."""

    def __init__(self, name, value):
        # Stores both arguments unchanged; nothing else happens here.
        self.name, self.value = name, value

# Accumulators (presumably filled with Topic / Entity objects further
# down the script -- confirm).
topicsList = []
entitiesList = []
# CSV columns: id,lastVisitTime,title,typedCount,url,visitCount

# Drop the first line of the file (presumably the header row holding the
# column names listed above).
lines.pop(0)
for line in lines:
	line = line.rstrip('\n')
	data = line.split(',')

示例#29

0

显示文件

文件： Code.py 项目： shivangis1/NLP_Work

import io,sys
from calais import Calais
API_KEY = "rg72c9882ypfjm24tvrfu6ab"
calais = Calais(API_KEY, submitter="python-calais demo")

## read text file
with open('file3.txt', 'r') as content_file:
    content = content_file.read()

## perform analysis on text
result = calais.analyze(content)
result.print_entities()

# Output file for the generated HTML; it stays open for the code below.
html_file = open("HTMLFile3.html", "w")


## the resulting entities obtained are not sorted, so we sort them here:
def comp(x, y):
    """Old-style comparator: order by first-instance offset, descending."""
    return y['instances'][0]['offset'] - x['instances'][0]['offset']


# Same ordering as ``sorted(result.entities, cmp=comp)`` -- descending by
# the offset of the first instance, ties kept in original order -- but
# written with ``key=``, which works on both Python 2 and 3.
sorted_results = sorted(result.entities,
                        key=lambda entity: entity['instances'][0]['offset'],
                        reverse=True)

b = content.decode("utf-8")

for i in range (len(sorted_results)):   #for each entity
    offset = sorted_results[i]['instances'][0]['offset']    #find offset
    length = sorted_results[i]['instances'][0]['length']    #find length
    total = offset + length                                 #find total length

    if offset != sorted_results[i-1]['instances'][0]['offset']: #to prevent same words being linked twice
        if 'resolutions' in sorted_results[i]:  #if rdf document exists
            link = sorted_results[i]['resolutions'][0]['id']
            data = "<a href = \"" + link + "\" target=\"_blank\">" + sorted_results[i]['name'] + "</a>"

示例#30

0

显示文件

文件： hotlist2.py 项目： globelab/61fresh

		links_hash[real_url]['first_tweeted'] = min(links_hash[real_url]['first_tweeted'],link['first_tweeted'])
		links_hash[real_url]['total_tweets'] = links_hash[real_url]['total_tweets']+link['total_tweets']
		links_hash[real_url]['weighted_tweets'] = links_hash[real_url]['weighted_tweets']+link['weighted_tweets']
		calculateHotness(links_hash[real_url])
		links_hash[real_url]['tweeters'].extend(link['tweeters'])
		links_hash[real_url]['tweeters'].sort(key=lambda x: x['followers_count'],reverse=True)
	else:
		links_hash[real_url] = link

# Flatten the per-URL records into a list.
links = links_hash.values()

# Highest 'hotness' first.
links.sort(key=lambda x: x['hotness'],reverse=True)


# The Calais client and the pickled classifier are only set up when
# neither the ``min`` nor the ``no_classify`` option is set.
if not opts.min and not opts.no_classify:
	calais = Calais("***REMOVED***", submitter="python-calais classify")
	# NOTE(review): pickle.load executes arbitrary code from the file;
	# only load a pickle produced by a trusted source.
	with open('savedclassifier.pickle','rb') as pkfile:
		classifier = pickle.load(pkfile)

for link in links:
	embedly = json.loads(link['embedly_blob'])

	if (not opts.min and not opts.no_classify) and link['sports_score'] is None:
		analysetext = ' '.join([embedly.get(x,'') for x in ['title', 'description', 'url'] if embedly.get(x,'') is not None])
		analysetext.encode("utf8")
		analysetext = analysetext.encode("utf8")
		analysetext= analysetext.replace('"', '\'')

		#core features extracted from classifier runs
		sportslist=['sports','nesn','weei','espn',#super types
			'Baseball','Hockey','Basketball','Football',#sports types

示例#31

0

显示文件

文件： extract_entities.py 项目： bishwasmishra/Open_Calais

from calais import Calais
API_KEY = "djgq52vv8uufzykmnb9g7myv"
calais = Calais(API_KEY, submitter ="python-calais demo")

# Three sample analyses; only the third response is inspected below.
result = calais.analyze('''Microsoft is a big company. George Bush was the President of the United States
of America until 2009.  Barack Obama is the new President of the United States now.''')
result2 = calais.analyze('''Microsoft is a big company''')
result3 = calais.analyze('''Troubled drug development company SFBC International said on Monday it has changed its name to PharmaNet Development Group Inc. and will be traded on Nasdaq under the stock symbol "PDGI".''')

# Accumulate the names of every "Technology" entity in the third response,
# printing the mapping again after each match.
d = {}
a = []
for entity in result3.entities:
    if entity["_type"] != "Technology":
        continue
    a.append(entity["name"])
    d["Technology"] = a
    print(d)

示例#32

0

显示文件

from calais import Calais

API_KEY = "k6s6cewwwc5zkemjqpw7yhru"
calais = Calais(API_KEY, submitter="python-calais demo")
result = calais.analyze("michelle obama")

# The print_* helpers write their report to stdout themselves, so they are
# called directly; wrapping them in `print` only emitted an extra line with
# their return value after each report.
result.print_summary()
result.print_topics()
result.print_relations()

示例#33

0

显示文件

文件： main.py 项目： kdimatteo/finint

def get_entities(content):
    """Analyze *content* with Calais and print the entities of the response.

    The API key is read from the ``API_KEY`` environment variable; a missing
    variable raises ``KeyError``. Nothing is returned.
    """
    client = Calais(os.environ['API_KEY'], submitter="python-calais demo")
    client.analyze(content).print_entities()

示例#34

0

显示文件

from bs4 import BeautifulSoup
from calais import Calais
import re
import dateutil.parser as dparser
from datetime import datetime
import unicodedata

#api key for calais
API_KEY = "g8gnzpdz52gkwyduv75zecem"
calais = Calais(API_KEY, submitter="Parsing TRACE Files")


def replace_accented(input_str):
    """Return *input_str* with all combining characters removed.

    The text is first decomposed with NFKD normalization, then every
    character that ``unicodedata.combining`` reports as combining is dropped.
    (Borrowed from baseline.py.)
    """
    decomposed = unicodedata.normalize('NFKD', input_str)
    kept = []
    for char in decomposed:
        if unicodedata.combining(char):
            continue
        kept.append(char)
    return u"".join(kept)


for num in range(1, 172):
    print num

    with open("files/id" + str(num) + ".txt", 'r') as f:
        text = f.read()
    f.closed

    soup = BeautifulSoup(text)

    if len(soup.find_all('div', class_='msgBody')) == 0:
        # print "Not a case" #only files containing a div class with the name "msgBody" are non case files.
        # else: #now just the cases
        #PERP COMPANY

示例#35

0

显示文件

文件： country_origin.py 项目： maple7sha/studentorigin_opencalais

def main():
  """Tally student countries of origin and write the counts to origin.csv.

  For every row of the student-profile CSV, row[3] (e-mail) and row[9]
  (address) are collected. Each non-empty address is sent to Calais and the
  returned entities decide which country bucket is incremented. When the
  address is empty, or Calais only reports an EmailAddress entity, the last
  dot-separated part of the e-mail address is looked up in the table loaded
  from Country_code.csv instead.
  """
  # read in csv file to extract the email and addresses field
  # put email and addr into a list of [email, addr] pairs
  email_addr = []
  with open('OECx_PH241x_3T2014_student_profile_info_2014-10-20-1645.csv', 'rU') as f:
      reader = csv.reader(f)
      for row in reader:
          pair = [row[3], row[9]]
          email_addr.append(pair)

  # create dictionary to find the country code | to iterate over the dictionary=> for key in d:
  # key = first space-separated word of column 0, value = remainder of column 0 (both lower-cased)
  country_code = {}
  with open('Country_code.csv', 'rU') as f:
      reader = csv.reader(f)
      for row in reader:
          key = row[0].split(' ', 1)[0].lower()
          value = row[0].split(' ', 1)[1].lower()
          country_code[key] = value

  # make Calais calls to extract country name
  # NOTE(review): the API key is hard-coded here
  api_key = 'wukyjrm778py5wry9qdtgk9u'
  calais = Calais(api_key)

  # dictionary to store all the results
  country_count = {}
  country_count['(TOTAL)'] = 0
  country_count['united states'] = 0
  country_count['~origin unknown'] = 0
  count = 0
  

  for pair in email_addr:
    # check == 1 means: fall back to the e-mail suffix lookup further down
    check = 0
    try:
      response = {}
      if pair[1] != '':
        response = calais.analyze(pair[1])
        # if the addr contains country information
        # NOTE(review): a non-empty address whose response has no 'entities'
        # attribute is not added to any bucket, yet still counts toward
        # (TOTAL) -- confirm this is intended
        if hasattr(response, 'entities'):
          # entry is a list of 3 elements: priority (3 for ProvinceOrState, 2 for Country, 1 for EmailAddress ), Country, Province 
          entry = [-1, '', '']
          for each in response.entities:
            if each['_type'] == 'ProvinceOrState':
              
              # the country comes from the first resolution of the province/state entity
              try: 
                entry[1] = each['resolutions'][0]['containedbycountry'].lower()
                entry[0] = 3
                entry[2] = each['name'].lower()
              except KeyError:
                print 'Country name cannot be retrieved'

            elif each['_type'] == 'Country':
              # only used when no ProvinceOrState entity has been recorded
              if entry[0] < 2:
                entry[0] = 2
                entry[1] = each['name'].lower()
            elif each['_type'] == 'EmailAddress':
              if entry[0] < 1:
                entry[0] = 1

          # province/state match: count the country and, for the US, the state as well
          if entry[0] == 3:
            name = '(US) - ' + entry[2]
            if entry[1] not in country_count:
              country_count[entry[1]] = 1
            else:
              country_count[entry[1]] = 1 + country_count[entry[1]]
              # 'united states' is pre-seeded in country_count, so US entries always reach this branch
              if entry[1] == 'united states':
                if name not in country_count:
                  country_count[name] = 1 
                else:
                  country_count[name] = 1 + country_count[name] 

          # country-only match
          elif entry[0] == 2:
            if entry[1] not in country_count:
              country_count[entry[1]] = 1   
            else:
              country_count[entry[1]] = 1 + country_count[entry[1]]

          elif entry[0] == 1:
            check = 1 # go through email check

          # entities were returned, but none of the three recognised types
          else:
            country_count['~origin unknown'] = country_count['~origin unknown'] + 1

      else: 
        check = 1

      # if addr is empty (or only an e-mail entity was found), query the email address mapping table
      # entries whose e-mail suffix is not in the table are counted under '~origin unknown'
      if check == 1: 
        # determine entry name
        email_endstr = pair[0].split('.')[-1].lower()
        if email_endstr in country_code:
          name = country_code[email_endstr]
        else:
          name = '~origin unknown'  
        # add entry 
        if name not in country_count:
          country_count[name] = 1
        else:
          country_count[name] = country_count[name]+1

    except ValueError:
      print 'Calais could not handle the language'
      country_count['~origin unknown'] = country_count['~origin unknown'] + 1
    count = count +1
    print 'Number of entries queried: ' + str(count)
  
  country_count['(TOTAL)'] = count
  # sorted() over the dict yields its keys in sorted order
  country = sorted(country_count)

  print country
  # us flags whether the 'united states' row has been written yet
  us = 0
  with open('origin.csv', 'w') as fp:
    a = csv.writer(fp, delimiter=',')
    for key in country:
      # the 'united states' row is emitted once, at the end of the first
      # iteration, instead of at its sorted position
      if key != 'united states':
        a.writerow([key, country_count[key]])
      if us == 0:
        a.writerow(['united states', country_count['united states']])
        us = 1

示例#36

0

显示文件

文件： edX_country_origin.py 项目： maple7sha/studentorigin_opencalais

def main():
    """Count students per country and write the tally to countrybreakdown.csv.

    E-mail (row[3]) and address (row[9]) are read from the student-profile
    CSV. Each non-empty address is analyzed by Calais; when the response has
    entities, the ``containedbycountry`` value of the first entity names the
    country. If there are no entities, or that value contains an '@'
    (an e-mail address entered as the address), the last dot-separated part
    of the e-mail is looked up in the Country_code.csv table, defaulting to
    'united states'.
    """
    # [email, address] pairs from the student profile export
    with open('OECx_PH241x_3T2014_student_profile_info_2014-10-20-1645.csv', 'rU') as f:
        email_addr = [[row[3], row[9]] for row in csv.reader(f)]

    # lookup table: first word of column 0 -> remainder of column 0, lower-cased
    country_code = {}
    with open('Country_code.csv', 'rU') as f:
        for row in csv.reader(f):
            pieces = row[0].split(' ', 1)
            country_code[pieces[0].lower()] = pieces[1].lower()

    # Calais client used to extract the country name
    api_key = 'wukyjrm778py5wry9qdtgk9u'
    calais = Calais(api_key)

    # running totals per country
    country_count = {'united states': 0}
    count = 0

    for email, address in email_addr:
        try:
            response = calais.analyze(address) if address != '' else {}

            use_email = True
            if hasattr(response, 'entities'):
                print(response.entities)
                name = response.entities[0]['containedbycountry'].lower()
                # an '@' means an e-mail address was entered as the address
                use_email = '@' in name

            if use_email:
                suffix = email.split('.')[-1].lower()
                name = country_code.get(suffix, 'united states')

            country_count[name] = country_count.get(name, 0) + 1
        except ValueError:
            print('Calais could not handle the language')
        count += 1
        print('Number of entries queried: ' + str(count))

    print(country_count)

    with open('countrybreakdown.csv', 'w') as fp:
        writer = csv.writer(fp, delimiter=',')
        for key, total in country_count.items():
            writer.writerow([key, total])

示例#37

0

显示文件

文件： __init__.py 项目： uclastudentmedia/django-supertagging

            return
        
    process_type = settings.DEFAULT_PROCESS_TYPE
    if 'contentType' in settings.PROCESSING_DIR:
        d_proc_type = proc_dir['contentType']

    if 'fields' not in params:
        if settings.ST_DEBUG:
            raise Exception('No "fields" found.')
        else:
            return

    # Create the instance of Calais and setup the parameters,
    # see open-calais.com for more information about user directives,
    # and processing directives
    c = Calais(settings.API_KEY)
    c.user_directives.update(settings.USER_DIR)
    c.processing_directives.update(settings.PROCESSING_DIR)
    c.processing_directives['contentType'] = process_type

    processed_tags = []
    for item in params['fields']:
        try:
            d = item.copy()
            
            field = d.pop('name')
            proc_type = d.pop('process_type', process_type)
            markup = d.pop('markup', False)

            data = getattr(obj, field)

示例#38

0

显示文件

文件： UserProfileGenerator.py 项目： royyeah/wse

score float NOT NULL,
PRIMARY KEY (ID)
);""")
cursor.execute("""CREATE TABLE tweet_entity
(
ID int NOT NULL AUTO_INCREMENT,
tweet_id bigint(20) NOT NULL,
entity varchar(255) NOT NULL,
relevance float NOT NULL,
PRIMARY KEY (ID)
);""")
q_topic_insert = "INSERT INTO tweet_topic (tweet_id,topic,score) VALUES (%s,%s,%s);"
q_entity_insert = "INSERT INTO tweet_entity (tweet_id,entity,relevance) VALUES (%s,%s,%s);"

# Configure Calais connection
calais = Calais(config.calais_api_key, submitter=config.calais_user)

userprofiles = {}
last_time = time.time()

for username in usernames:
    if limit == None:
        cursor.execute("SELECT id, content FROM tweets_sample WHERE username='******'")
    else:
        cursor.execute("SELECT id, content FROM tweets_sample WHERE username='******' LIMIT " + str(limit))
    data = cursor.fetchall()
    
    analyzed = 0
    skipped = 0
    topics = {}
    no_topics = 0

示例#39

0

显示文件

文件： classifyfreshlib.py 项目： globelab/61fresh

def classify(real_url_hash):
    """Classify one stored URL and persist the topic and sports score.

    Loads the ``url_info`` row matching *real_url_hash*, builds a text sample
    from the ``title``, ``description`` and ``url`` fields of its embedly JSON
    blob, and runs it through two classifiers: the pickled classifier stored
    in ``savedclassifier.pickle`` and OpenCalais as a cross-check. Both
    verdicts are written to ``url_info.topic_blob`` and
    ``url_info.sports_score`` is updated accordingly. Texts that Calais fails
    on are recorded in ``error_classify``.

    MySQL settings are read from ``config-local.json``, falling back to
    ``config.json`` when the former cannot be opened.

    Args:
        real_url_hash: value of ``url_info.real_url_hash`` selecting the row.
    """
    API_KEY = "***REMOVED***"

    try:
        with open('config-local.json') as fh:
            config = json.load(fh)
    except IOError:
        with open('config.json') as fh:
            config = json.load(fh)

    db = MySQLdb.connect(
        host=config['mysql']['host'],
        port=config['mysql']['port'],
        user=config['mysql']['user'],
        passwd=config['mysql']['password'],
        db=config['mysql']['database'],
        use_unicode=True,
        charset="utf8")

    # Everything after the connect runs under try/finally so the connection
    # is released even when classification or a query raises.
    try:
        db.autocommit(True)

        calais = Calais(API_KEY, submitter="python-calais classify")

        # Classifier produced by the training run.
        # NOTE(review): pickle.load can execute arbitrary code; the pickle
        # file must come from a trusted source.
        with open('savedclassifier.pickle', 'rb') as pkfile:
            classifier = pickle.load(pkfile)

        cursor = db.cursor()

        # Values are passed as query parameters rather than concatenated into
        # the SQL text, so quotes in the data cannot break or alter the
        # statement.
        cursor.execute(
            "SELECT embedly_blob,real_url_hash  FROM  url_info "
            "WHERE real_url_hash=%s LIMIT 1;",
            (real_url_hash,))

        for row in cursor.fetchall():
            real_url_hash = row[1]
            jsondecode = json.loads(row[0])
            parts = [jsondecode['title'], jsondecode['description'],
                     jsondecode['url']]

            if all(parts):
                analysetext = ' '.join(parts)
            else:
                # Same text the original built field by field: a leading
                # space followed by ' ' + field for every non-empty field.
                analysetext = ' ' + ''.join(' ' + part for part in parts if part)

            # NOTE(review): the file is written for Python 2, where encode()
            # yields a byte string that accepts str arguments in replace().
            analysetext = analysetext.encode("utf8")
            analysetext = analysetext.replace('"', '\'')

            # classifier 1: naive bayes
            _topic = classifier.classify(analysetext)
            _score = classifier.get_score(analysetext)

            # classifier 2: opencalais crosscheck
            try:
                result = calais.analyze(analysetext)
                topic = result.get_topics()
                score = result.get_topic_score()
            except Exception:
                # Best effort: record the failing text and continue with the
                # naive bayes verdict only.
                topic = "None"
                score = 0.0
                cursor.execute(
                    "INSERT INTO error_classify (real_url_hash, text) "
                    "VALUES(%s, %s);",
                    (real_url_hash, analysetext))

            print(real_url_hash)
            print(analysetext)
            print(topic)
            print(_topic)

            # create json output
            jsonOutput = "{\"topic\":\"%s\" , \"score\":\"%f\" , \"_topic\":\"%s\" , \"_score\":\"%f\"}" % (topic, score, _topic, _score)
            cursor.execute(
                "UPDATE url_info SET topic_blob=%s WHERE real_url_hash=%s;",
                (jsonOutput, real_url_hash))

            # update score: prefer the Calais score, fall back to naive bayes
            # when Calais produced nothing, otherwise store zero
            if topic == "Sports":
                sports_score = str(score)
            elif topic == "None" and _topic == "sports":
                sports_score = str(_score)
            else:
                sports_score = '0'
            cursor.execute(
                "UPDATE url_info SET sports_score=%s WHERE real_url_hash=%s;",
                (sports_score, real_url_hash))
    finally:
        db.close()

示例#40

0

显示文件

            if leftToWait>0:
                time.sleep(leftToWait)
            ret = func(*args,**kargs)
            lastTimeCalled[0] = time.clock()
            return ret
        return rateLimitedFunction
    return decorate


Manager = QueryManager()
# Calais rate-limits our analysis requests.
# The queue below (currently commented out) was meant to throttle requests
# without needing to sleep:
# AnalysisQueue = queue.Queue(.4);
# AnalysisQueue.execute()
# NOTE(review): the API key is hard-coded in the source.
key = "c3wjfrkfmrsft3r5wgxm5skr"
CalaisObj = Calais(key, submitter="Sam Purcell")

def pr(*args):
  """Print args[0] %-formatted with the remaining arguments, then flush stdout.

  With no extra arguments the format string is applied to an empty list.
  """
  template = args[0]
  if len(args) > 1:
    values = args[1:]
  else:
    values = []
  print(template % values)
  sys.stdout.flush()

def tryConnection (applyfun): 
    """Call *applyfun* and return its result, retrying once on a DB error.

    If the first call raises ``exc.SQLAlchemyError`` the session is rolled
    back and *applyfun* is called a second time; an error from that second
    call propagates to the caller.
    """
    try:
        outcome = applyfun()
    except exc.SQLAlchemyError:
        db.session.rollback()
        outcome = applyfun()
    return outcome

class News():
    normalizers = {
        "feedzilla" : {

示例#41

0

显示文件

文件： poem.py 项目： jannae/rwet

from calais import Calais

# NOTE(review): the API key is hard-coded in the source.
API_KEY = "s5mba8qn5qb4vjmc663qxn8m"
calais = Calais(API_KEY, submitter="jannae")

# Run Calais' analyze_file on sotu2012.txt and print the summary of the result.
result = calais.analyze_file("sotu2012.txt")

result.print_summary()

def print_entities(self):
    """Print one "type: name (relevance)" line for each item in self.entities.

    Prints nothing when *self* has no ``entities`` attribute. Always returns
    None.
    """
    if hasattr(self, "entities"):
        for entity in self.entities:
            line = "%s: %s (%.2f)" % (entity['_type'], entity['name'], entity['relevance'])
            print(line)
    return None

示例#42

0

显示文件

文件： NYTOpen.py 项目： rashmi-raman/Playing-around-with-Open-Calais

from calais import Calais
import os

API_KEY = "v5q6rvm7h4uww6sumjxuw9t7"
calais = Calais(API_KEY, submitter="python-calais demo")

# The file is only read, so it is opened read-only; the context manager
# closes it even if read() raises.
with open("Text/test.txt", 'r') as f:
    NYT = f.read()

# Analyze the text and print Calais' summary and entity report.
result = calais.analyze(NYT)

result.print_summary()
result.print_entities()
示例#43

0

显示文件

def get_entities(content):
    """Analyze *content* with Calais and print the entities of the response.

    The API key is read from the ``API_KEY`` environment variable; a missing
    variable raises ``KeyError``. Nothing is returned.
    """
    client = Calais(os.environ['API_KEY'], submitter="python-calais demo")
    client.analyze(content).print_entities()

示例#44

0

显示文件

from calais import Calais
API_KEY = "dkm645ejqmq7aajt8cp6zxk7"
calais = Calais(API_KEY, submitter="python-calais demo")
result = calais.analyze(
    "My 15 year old Daughter has sores in her genital area and her mouth.She swares she did nothing but kiss her boyfriend.She also has flu-like symptoms."
)
# Print the summary, entity and topic reports, in that order.
for report_name in ("print_summary", "print_entities", "print_topics"):
    getattr(result, report_name)()
# Wait for a line of input before the script exits.
p = raw_input()