# Assumed imports: from zope.component import getUtility;
# from plone.registry.interfaces import IRegistry; from calais import Calais
def get_calais_subjects(text, uid):
    """Return Calais social tags for `text` that clear the configured relevance."""
    registry = getUtility(IRegistry)
    settings = registry.forInterface(ITagHelperSettingsSchema)
    api_key = settings.calais_api_key
    relevance = settings.calais_relevance
    subjects = []
    if api_key:
        calais = Calais(api_key)
        try:
            result = calais.analyze(text, external_id=uid)
        except Exception:
            return []
        #if hasattr(result, 'entities'):
        #    for entity in result.entities:
        #        if entity['_type'] in PREFERRED_ENTITIES:
        #            subjects.append(entity['name'])
        if hasattr(result, 'socialTag'):
            for tag in result.socialTag:
                if float(tag['importance']) > relevance:
                    subjects.append(tag['name'])
        #if hasattr(result, 'relations'):
        #    for fact in result.relations:
        #        if fact['_type'] in PREFERRED_FACTS:
        #            ft = fact.get(fact['_type'].lower())
        #            if ft:
        #                subjects.append(ft)
    return subjects
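# A minimal usage sketch for get_calais_subjects above, assuming a Plone site
# where ITagHelperSettingsSchema is registered and an API key is configured;
# `context` (a content object with getText/UID) is hypothetical.
subjects = get_calais_subjects(context.getText(), context.UID())
for subject in subjects:
    print subject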
def __init__(self):
    self.db_name = "pythia_db"
    self.connections = {
        "db": connect(self.db_name),
        "calais": Calais("av536xwvy4mgmcbw9cancqmd",
                         submitter="pythia-application"),
    }
def __init__(self, content_object, content_fields=None):
    super(OpenCalais, self).__init__(content_object)
    self.calais = Calais(settings.CALAIS_API_KEY, settings.CALAIS_SUBMITTER)
    if content_fields:
        self.calais_content_fields = content_fields
    else:
        try:
            self.calais_content_fields = dict(
                self.content_object.__class__.calais_content_fields)
        except FieldDoesNotExist as e:
            raise OpenCalaisTagFetchError(
                'You need to define calais_content_fields: %s' % e)
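# A hedged sketch of a Django model the wrapper above could consume; the Entry
# model is hypothetical, and the exact value format of calais_content_fields
# is an assumption: the code above only requires it to be dict()-convertible.
from django.db import models

class Entry(models.Model):
    title = models.CharField(max_length=200)
    body = models.TextField()

    calais_content_fields = {'title': 'default', 'body': 'default'}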
def entities(env, start_response):
    """Extract entities from a resume using the OpenCalais web service."""
    start_response('200 OK', [('Content-Type', 'text/xml')])
    API_KEY = "kqyhhfppufvmvxspkstwjxw5"
    calais = Calais(API_KEY, submitter="resume_analysis")
    try:
        with open('Darin_Plutchok_Resume_Taxonomist.txt') as f:
            text = f.read()
    except IOError:
        raise restlite.Status('400 Error Reading File')
    try:
        results = calais.analyze(text)
    except Exception as e:
        return ["<error>%s</error>" % e]  # WSGI bodies must be iterables of strings
    entities_tuples = [(entity['name'], entity['_type'], entity['relevance'])
                       for entity in results.entities]
    doc = create_xml({'entities': entities_tuples})
    return [str(doc.toxml())]
import sys, os, inspect, random, logging

def extract_entities(text, retries=5):
    """
    Input: document text
    Output: list of Calais entity types found in the text
    """
    import time
    sys.path.insert(0, os.path.realpath(os.path.abspath(os.path.join(
        os.path.split(inspect.getfile(inspect.currentframe()))[0],
        "../../ner"))))
    from calais import Calais
    # Seed on the text so the same document always picks the same API key.
    random.seed(text)
    API_KEYS = [
        "wdbkpbpsksskkbm2rpqfm4xa",
        "mt5qu3e4jdrd6jpc9r9ecama",
        "k9fb7rfh7hpbfp238whuggrr",
        "55rared7un2pnjr23kjtctes",
        "ccw5tvhv5sewvnnnpkfa9ydn",
        "ne7yxpax4ebug4qz3p4jguej",
        "nsuasahckne72keu8qu6zjd3",
        "bvuy6mqmr7z7x8jw5f4zzpkr",
    ]
    calaises = [Calais(key, submitter="python-calais-demo") for key in API_KEYS]
    entities = []
    calais = calaises[random.randint(0, len(calaises) - 1)]
    for i in range(retries):
        try:
            result = calais.analyze(text)
            if hasattr(result, 'entities'):
                for calais_entity in result.entities:
                    e_type = calais_entity['_type']
                    entities.append(e_type)
            return entities
        except Exception:
            logging.exception("failed while calling calais")
            time.sleep(1)
    logging.error("failed with all tries to call calais")
    return entities
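# Hypothetical call site for extract_entities: tallying entity types for one
# document; assumes the ../../ner path above really holds the calais module
# and that bill_text.txt exists.
from collections import Counter

with open('bill_text.txt') as f:
    counts = Counter(extract_entities(f.read()))
for e_type, n in counts.most_common(5):
    print "%s: %d" % (e_type, n)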
def _get_people(text):
    '''
    Runs input text through Calais to extract people, coreferences and their
    locations. This function returns a canonical name for any given source in
    the document and contextual information about where coreferences appear,
    based on the text before and after the pronoun occurrence.

    Takes full story text as input.

    This is a pretty bare-bones function. It doesn't handle Calais API errors,
    so it tends to crap out from time to time. Future refinements should
    account for this.
    '''
    # Run input text through Calais
    calais = Calais(API_KEY, submitter="tbc-coref-test")
    annotations = calais.analyze(text)

    # If no entities come back, peace out
    if not hasattr(annotations, 'entities'):
        return False

    coref = {}  # Dictionary to hold our coreference object.
    for e in annotations.entities:
        instances = []
        # We only care about Person entities, not companies, places, etc.
        if e['_type'] == 'Person':
            # For each instance of that entity (which includes pronouns and
            # other references) ...
            for i in e['instances']:
                # Collect the coreference text (exact), the preceding text
                # (prefix) and the following text (suffix) for reference
                # information. We'll need this later.
                instances.append(
                    (i.get('exact'), i.get('suffix', ''), i.get('prefix', '')))
            # Associate the canonical name with the coreference and context
            # information gathered above for use later.
            name = e.get("commonname", e.get('name', None))
            coref[name] = instances
    return coref
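# A small hedged example of consuming _get_people's return value; the story
# text is made up, and API_KEY is assumed to be defined at module level.
story = "Jane Doe said she would resign. She later confirmed the decision."
people = _get_people(story)
if people:
    for name, mentions in people.items():
        for exact, suffix, prefix in mentions:
            print "%s mentioned as %r: ...%s %s %s..." % (
                name, exact, prefix, exact, suffix)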
### Ariana Giorgi
### 10/31/2014
### Computational Journalism Assignment #3 - Open Calais
### https://code.google.com/p/python-calais/

from calais import Calais
import collections

# set Calais API key and create a new instance
API_KEY = "g8gnzpdz52gkwyduv75zecem"
calais = Calais(API_KEY, submitter="python-calais demo")

# demo text (overwritten by the file contents below)
input_text = "George Bush was the President of the United States of America until 2009. Barack Obama is the new President of the United States now."
with open('stdin_1.txt', 'r') as f:
    input_text = f.read()

result = calais.analyze(input_text)
# result.print_entities()

# initialize dictionary that will contain the linked data
link_list = {}
# initialize detected references count (collected for assignment)
detected_count = 0

# loop through each entity and assign a link
for i in range(len(result.entities)):
    if 'resolutions' in result.entities[i]:
        # if Calais has assigned an RDF value, use that as the link
            if leftToWait > 0:
                time.sleep(leftToWait)
            ret = func(*args, **kargs)
            lastTimeCalled[0] = time.clock()
            return ret
        return rateLimitedFunction
    return decorate


Manager = QueryManager()

# Calais rate-limits our analysis requests;
# this throttles requests without needing to sleep
# AnalysisQueue = queue.Queue(.4)
# AnalysisQueue.execute()
key = "c3wjfrkfmrsft3r5wgxm5skr"
CalaisObj = Calais(key, submitter="Sam Purcell")


def pr(*args):
    print args[0] % (len(args) > 1 and args[1:] or [])
    sys.stdout.flush()


def tryConnection(applyfun):
    try:
        return applyfun()
    except exc.SQLAlchemyError:
        db.session.rollback()
        return applyfun()


class News():
    normalizers = {
        "feedzilla": {
from bs4 import BeautifulSoup
from calais import Calais
import re
import dateutil.parser as dparser
from datetime import datetime
import unicodedata

# API key for Calais
API_KEY = "g8gnzpdz52gkwyduv75zecem"
calais = Calais(API_KEY, submitter="Parsing TRACE Files")


def replace_accented(input_str):
    # from baseline.py
    nkfd_form = unicodedata.normalize('NFKD', input_str)
    return u"".join([c for c in nkfd_form if not unicodedata.combining(c)])


for num in range(1, 172):
    print num
    with open("files/id" + str(num) + ".txt", 'r') as f:
        text = f.read()
    soup = BeautifulSoup(text)
    if len(soup.find_all('div', class_='msgBody')) == 0:
        # print "Not a case"
        # only files containing a div with class "msgBody" are case files.
        pass
    # else:  # now just the cases
    #     PERP COMPANY
def get_entities(content):
    API_KEY = os.environ['API_KEY']
    calais = Calais(API_KEY, submitter="python-calais demo")
    result = calais.analyze(content)
    result.print_entities()
import sys, os, inspect

sys.path.insert(0, os.path.realpath(os.path.abspath(os.path.join(
    os.path.split(inspect.getfile(inspect.currentframe()))[0],
    ".."))))
from calais import Calais
import codecs
import random
import psycopg2
from util import path_tools

USAGE = "python %s <bill-version-file> <bill|report>"
API_KEYS = [
    "wdbkpbpsksskkbm2rpqfm4xa",
    "mt5qu3e4jdrd6jpc9r9ecama",
    "k9fb7rfh7hpbfp238whuggrr",
    "55rared7un2pnjr23kjtctes",
]
MAX_TEXT_LENGTH = 100000
calaises = [Calais(key, submitter="python-calais-demo") for key in API_KEYS]

from util import configuration
CONN_STRING = configuration.get_connection_string()


class Entity:
    def __str__(self):
        return "%s | %s | %s | %d:%d" % (self.text, self.name, self.type,
                                         self.offset, self.offset + self.length)


def read_file(path):
    with codecs.open(path, 'r', 'utf8') as f:
        content = f.read()
# `file` is assumed to be a list of article base names defined earlier.
for filename in file:
    fout = open("results/" + filename + ".html", "w")
    fout.write('<html>')
    fout.write('\r\n')
    fout.write('<head><title>' + filename + '</title></head>')
    fout.write('\r\n')
    fout.write('<body>')
    with open("articles/" + filename + ".txt", "r") as myfile:
        # read the article directly; no need to rebind sys.stdin
        content = myfile.read()
    API_KEY = "f7vhuv2kt4fxufuvv6eznwpe"
    calais = Calais(API_KEY, submitter="python-calais newsparser")
    result = calais.analyze(content)
    print "Summary of the Calais Analysis"
    result.print_summary()
    print "Entity of the Calais Analysis"
    result.print_entities()
    i = 0
    temp = []
    entityList = []
    html = []
    for entity in result.entities:
        if result.entities[i]["_type"] in [
                "City", "Company", "Country", "Movie", "Organization",
#!/usr/bin/env python
import sys

"""
Initialize requirements for OpenCalais
"""
from calais import Calais

CALAIS_API_KEY = 'ed42bg3ku3g3k98kv9kee78s'
calais = Calais(CALAIS_API_KEY, submitter="pagea1 tester")


def body2entities(body):
    """
    Given an article (STRING body), use the Open Calais named entity
    recognizer to return all entities therein.
    """
    names, companies, orgs, terms = [], [], [], []
    result = calais.analyze(body)
    for entity in result.entities:
        if entity["_type"] == "Person":
            names.append(entity["name"])
        if entity["_type"] == "Company":
            companies.append(entity["name"])
        if entity["_type"] == "Organization":
            orgs.append(entity["name"])
        if entity["_type"] == "IndustryTerm":
            terms.append(entity["name"])
    return names, companies, orgs, terms
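# A hedged usage sketch for body2entities; the article text is made up.
article = "Tim Cook said Apple will keep working with IBM on enterprise software."
names, companies, orgs, terms = body2entities(article)
print "People: %s" % ', '.join(names)
print "Companies: %s" % ', '.join(companies)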