示例#1
0
def get_calais_subjects(text, uid):
    """Return Calais social-tag names for ``text`` to use as subjects.

    The API key and the relevance threshold are read from the registry
    record for ``ITagHelperSettingsSchema``.

    Args:
        text: The content to analyze.
        uid: Identifier passed to Calais as ``external_id``.

    Returns:
        A list with the ``name`` of every social tag whose ``importance``
        is strictly greater than the configured relevance. The list is
        empty when no API key is configured, when the Calais call fails,
        or when the result carries no ``socialTag`` attribute.
    """
    registry = getUtility(IRegistry)
    settings = registry.forInterface(ITagHelperSettingsSchema)
    api_key = settings.calais_api_key
    relevance = settings.calais_relevance
    if not api_key:
        return []

    calais = Calais(api_key)
    try:
        result = calais.analyze(text, external_id=uid)
    except Exception:
        # Best effort: a failed lookup yields no subjects. ``Exception``
        # (rather than a bare except) lets SystemExit/KeyboardInterrupt
        # propagate.
        return []

    # Only social tags are collected; entity- and relation-based subjects
    # are not used here.
    return [
        tag['name']
        for tag in getattr(result, 'socialTag', [])
        if float(tag['importance']) > relevance
    ]
示例#2
0
    def __init__(self):
        """Record the database name and open the two service connections.

        ``self.connections`` maps "db" to the result of ``connect`` for
        ``self.db_name`` and "calais" to a ``Calais`` client.
        """
        self.db_name = "pythia_db"

        # Created in the same order as before: database first, then Calais.
        database = connect(self.db_name)
        calais_client = Calais(
            "av536xwvy4mgmcbw9cancqmd", submitter="pythia-application")

        self.connections = {"db": database, "calais": calais_client}
示例#3
0
    def __init__(self, content_object, content_fields=None):
        """Create the Calais client and decide which content fields to use.

        Args:
            content_object: Object handed to the parent initializer.
            content_fields: Optional fields to use. When falsy, the
                ``calais_content_fields`` attribute declared on the content
                object's class is copied into a dict instead.

        Raises:
            OpenCalaisTagFetchError: If ``content_fields`` is not given and
                the content object's class does not define
                ``calais_content_fields``.
        """
        super(OpenCalais, self).__init__(content_object)
        self.calais = Calais(settings.CALAIS_API_KEY,
                             settings.CALAIS_SUBMITTER)

        if content_fields:
            self.calais_content_fields = content_fields
        else:
            # NOTE(review): self.content_object is presumably set by the
            # parent initializer -- confirm.
            try:
                self.calais_content_fields = dict(
                    self.content_object.__class__.calais_content_fields)
            except (AttributeError, FieldDoesNotExist) as e:
                # A class without the attribute raises AttributeError, so it
                # must be caught for the intended error to surface.
                raise OpenCalaisTagFetchError(
                    'You need to define calais_content_fields: %s' % e)
示例#4
0
def entities(env, start_response):
    """Extracts entities from resume utilizing the OpenCalais webservice.

    Reads the resume text from a fixed file in the working directory, sends
    it to Calais and renders the entities found as an XML document.

    Args:
        env: Request environment (unused here; presumably the WSGI environ).
        start_response: Callable invoked with the status line and headers.

    Returns:
        A one-element list holding the XML produced by ``create_xml``, or a
        plain ``<error>...</error>`` string when the Calais request fails.

    Raises:
        restlite.Status: '400 Error Reading File' when the resume file
            cannot be read.
    """
    API_KEY = "kqyhhfppufvmvxspkstwjxw5"
    calais = Calais(API_KEY, submitter="resume_analysis")
    try:
        with open('Darin_Plutchok_Resume_Taxonomist.txt') as f:
            text = f.read()
    except Exception:
        # Call form of raise is valid on both Python 2 and Python 3.
        raise restlite.Status('400 Error Reading File')

    # Announce success only after the input has been read, so a 400 raised
    # above is not preceded by an already-sent '200 OK'.
    start_response('200 OK', [('Content-Type', 'text/xml')])

    try:
        results = calais.analyze(text)
    except Exception as e:
        return "<error>%s</error>" % e

    # Other callers in this code base guard against a result that has no
    # ``entities`` attribute; treat that case as "no entities".
    entities_tuples = [(entity['name'], entity['_type'], entity['relevance'])
                       for entity in getattr(results, 'entities', [])]
    doc = create_xml({'entities': entities_tuples})

    return [str(doc.toxml())]
def extract_entities(text, retries=5):
    """Return the Calais entity types found in ``text``.

    Args:
        text: The text to analyze. It also seeds the random generator, so
            the same text always selects the same API key.
        retries: Maximum number of attempts at calling Calais.

    Returns:
        A list with the ``_type`` of every entity in the Calais result, in
        result order. The list is empty when the result has no entities or
        when every attempt failed.
    """
    import time

    # Make the bundled ``ner`` directory importable, but only add it once so
    # repeated calls do not keep growing sys.path.
    ner_path = os.path.realpath(
        os.path.abspath(
            os.path.join(
                os.path.split(inspect.getfile(inspect.currentframe()))[0],
                "../../ner")))
    if ner_path not in sys.path:
        sys.path.insert(0, ner_path)
    from calais import Calais

    # NOTE: this reseeds the module-level random generator.
    random.seed(text)
    API_KEYS = [
        "wdbkpbpsksskkbm2rpqfm4xa", "mt5qu3e4jdrd6jpc9r9ecama",
        "k9fb7rfh7hpbfp238whuggrr", "55rared7un2pnjr23kjtctes",
        "ccw5tvhv5sewvnnnpkfa9ydn", "ne7yxpax4ebug4qz3p4jguej",
        "nsuasahckne72keu8qu6zjd3", "bvuy6mqmr7z7x8jw5f4zzpkr"
    ]
    # Pick one key and build only the client that will actually be used.
    api_key = API_KEYS[random.randint(0, len(API_KEYS) - 1)]
    calais = Calais(api_key, submitter="python-calais-demo")

    for _ in range(retries):
        try:
            result = calais.analyze(text)
            # Build a fresh list per attempt so a failure part-way through
            # cannot leave duplicates behind for the next attempt.
            return [
                calais_entity['_type']
                for calais_entity in getattr(result, 'entities', [])
            ]
        except Exception:
            logging.exception("failed while calling calais")
            time.sleep(1)
    logging.error("failed with all tries to call calais")
    return []
示例#6
0
def _get_people(text):
    """
    Extract people and their coreferences from a story via Calais.

    Takes the full story text as input and sends it to Calais. For every
    entity of type 'Person' the result maps the person's canonical name
    (``commonname`` if present, otherwise ``name``) to a list of
    ``(exact, suffix, prefix)`` tuples, one per instance of that entity:
    the reference text itself, the text following it and the text
    preceding it.

    Returns False when the Calais result has no ``entities`` attribute.

    Calais API errors are not handled here and propagate to the caller.
    """
    client = Calais(API_KEY, submitter="tbc-coref-test")
    annotations = client.analyze(text)

    # Nothing to report when Calais returned no entities at all.
    if not hasattr(annotations, 'entities'):
        return False

    people = {}
    for entity in annotations.entities:
        # Companies, places and other non-person entities are ignored.
        if entity['_type'] != 'Person':
            continue

        # One tuple per mention, including pronouns and other references.
        mentions = [
            (mention.get('exact'),
             mention.get('suffix', ''),
             mention.get('prefix', ''))
            for mention in entity['instances']
        ]

        fallback_name = entity.get('name', None)
        canonical_name = entity.get("commonname", fallback_name)
        people[canonical_name] = mentions
    return people
示例#7
0
### Ariana Giorgi
### 10/31/2014
### Computational Journalism Assignment #3 - Open Calais
### https://code.google.com/p/python-calais/

from calais import Calais
import collections

# Set the Calais API key and create a client instance.
# NOTE(review): the API key is hard-coded in source.
API_KEY = "g8gnzpdz52gkwyduv75zecem"
calais = Calais(API_KEY, submitter="python-calais demo")

# Demo text. NOTE: this value is overwritten by the file contents read
# just below, so it is never analyzed.
input_text = "George Bush was the President of the United States of America until 2009.  Barack Obama is the new President of the United States now."

# Read the text to analyze from stdin_1.txt in the working directory.
with open('stdin_1.txt', 'r') as f:
    input_text = f.read()
# No-op: the attribute is evaluated and discarded; the with-block has
# already closed the file.
f.closed

# Send the text to Calais for analysis.
result = calais.analyze(input_text)
#result.print_entities()

# Initialize the dictionary that will contain the linked data.
link_list = {}
# Initialize the detected-references count (collected for the assignment).
detected_count = 0

#loop through each entity and assign a link
for i in range(len(result.entities)):
    if 'resolutions' in result.entities[i]:
        #if Calais has assigned an RDF value, use that as the link
示例#8
0
            if leftToWait>0:
                time.sleep(leftToWait)
            ret = func(*args,**kargs)
            lastTimeCalled[0] = time.clock()
            return ret
        return rateLimitedFunction
    return decorate


# Module-level QueryManager instance.
Manager = QueryManager()
# Calais rate-limits our analysis requests;
# the (disabled) queue below was meant to throttle requests without sleeping.
# AnalysisQueue = queue.Queue(.4);
# AnalysisQueue.execute()
# NOTE(review): the API key is hard-coded in source.
key = "c3wjfrkfmrsft3r5wgxm5skr"
# Module-level Calais client created with the key above.
CalaisObj = Calais(key, submitter="Sam Purcell")

def pr(*args):
    """Print a message and flush stdout immediately.

    Args:
        *args: ``args[0]`` is the message. Any further positional arguments
            are substituted into it with the ``%`` operator. When no further
            arguments are given the message is printed verbatim, so a
            literal ``%`` in it is safe.

    Raises:
        IndexError: If called without any argument.
    """
    message = args[0]
    if len(args) > 1:
        message = message % args[1:]
    # Single-argument call form prints the same on Python 2 and Python 3.
    print(message)
    sys.stdout.flush()

def tryConnection(applyfun):
    """Call ``applyfun`` and retry once after a rollback on a database error.

    Args:
        applyfun: Zero-argument callable to run.

    Returns:
        Whatever ``applyfun`` returns, from the first call if it succeeds,
        otherwise from the second call.

    If the first call raises ``exc.SQLAlchemyError``, ``db.session`` is
    rolled back and ``applyfun`` is called a second time; an error from
    that second call propagates to the caller.
    """
    try:
        outcome = applyfun()
    except exc.SQLAlchemyError:
        db.session.rollback()
        outcome = applyfun()
    return outcome

class News():
    normalizers = {
        "feedzilla" : {
示例#9
0
from bs4 import BeautifulSoup
from calais import Calais
import re
import dateutil.parser as dparser
from datetime import datetime
import unicodedata

# API key for Calais and the module-level client created with it.
# NOTE(review): the API key is hard-coded in source.
API_KEY = "g8gnzpdz52gkwyduv75zecem"
calais = Calais(API_KEY, submitter="Parsing TRACE Files")


def replace_accented(input_str):
    """Return ``input_str`` with combining marks (e.g. accents) removed.

    The text is decomposed with Unicode NFKD normalization and every
    combining character in the result is dropped.
    (Adapted from baseline.py.)
    """
    decomposed = unicodedata.normalize('NFKD', input_str)
    base_chars = (ch for ch in decomposed if unicodedata.combining(ch) == 0)
    return u"".join(base_chars)


for num in range(1, 172):
    print num

    with open("files/id" + str(num) + ".txt", 'r') as f:
        text = f.read()
    f.closed

    soup = BeautifulSoup(text)

    if len(soup.find_all('div', class_='msgBody')) == 0:
        # print "Not a case" #only files containing a div class with the name "msgBody" are non case files.
        # else: #now just the cases
        #PERP COMPANY
示例#10
0
def get_entities(content):
    """Analyze ``content`` with Calais and print the entities of the result.

    The API key is read from the ``API_KEY`` environment variable; a missing
    variable raises KeyError. Nothing is returned.
    """
    client = Calais(os.environ['API_KEY'], submitter="python-calais demo")
    analysis = client.analyze(content)
    analysis.print_entities()
            os.path.join(
                os.path.split(inspect.getfile(inspect.currentframe()))[0],
                ".."))))
from calais import Calais
import codecs
import random
import psycopg2
from util import path_tools

# Command-line usage template; %s is a placeholder (presumably for the
# script name -- confirm against where USAGE is formatted).
USAGE = "python %s <bill-version-file> <bill|report>"
# Calais API keys; one client per key is created below.
# NOTE(review): the API keys are hard-coded in source.
API_KEYS = [
    "wdbkpbpsksskkbm2rpqfm4xa", "mt5qu3e4jdrd6jpc9r9ecama",
    "k9fb7rfh7hpbfp238whuggrr", "55rared7un2pnjr23kjtctes"
]
# NOTE(review): presumably the maximum length of text sent to Calais in one
# request -- confirm against its usage.
MAX_TEXT_LENGTH = 100000
# One Calais client per API key, in the same order as API_KEYS.
calaises = [Calais(key, submitter="python-calais-demo") for key in API_KEYS]

from util import configuration
CONN_STRING = configuration.get_connection_string()


class Entity:
    """Record describing one extracted entity.

    The class defines no attributes itself; ``text``, ``name``, ``type``,
    ``offset`` and ``length`` must be assigned on the instance before it is
    formatted.
    """

    def __str__(self):
        """Render as ``text | name | type | start:end``."""
        # The end position is the offset plus the entity's length.
        end = self.offset + self.length
        fields = (self.text, self.name, self.type, self.offset, end)
        return "%s | %s | %s | %d:%d" % fields


def read_file(path):
    with codecs.open(path, 'r', 'utf8') as f:
        content = f.read()
示例#12
0
for filename in file:
    fout = open(("results/" + filename + ".html"), "w")
    fout.write('<html>')
    fout.write('\r\n')
    fout.write('<head><title>' + filename + '</title></head>')
    fout.write('\r\n')
    fout.write('<body>')

    with open(("articles/" + filename + ".txt"), "r") as myfile:
        sys.stdin = myfile
        content = ""
        for line in sys.stdin:
            content += line

        API_KEY = "f7vhuv2kt4fxufuvv6eznwpe"
        calais = Calais(API_KEY, submitter="python-calais newsparser")
        result = calais.analyze(content)

        print "Summary of the Calais Analysis"
        result.print_summary()

        print "Entity of the Calais Analysis"
        result.print_entities()

        i = 0
        temp = []
        entityList = []
        html = []
        for entity in result.entities:
            if result.entities[i]["_type"] in [
                    "City", "Company", "Country", "Movie", "Organization",
示例#13
0
#!/usr/bin/env python

import sys
"""
Initialize requirements for OpenCalais
"""

from calais import Calais

# Calais API key and the module-level client shared by the functions below.
# NOTE(review): the API key is hard-coded in source.
CALAIS_API_KEY = 'ed42bg3ku3g3k98kv9kee78s'
calais = Calais(CALAIS_API_KEY, submitter="pagea1 tester")


def body2entities(body):
    """
    Given an article (STRING body), use the Open Calais named entity
    recognizer to return the entities therein, grouped by type.

    Args:
        body: The article text to analyze.

    Returns:
        A 4-tuple of lists ``(names, companies, orgs, terms)`` holding the
        ``name`` of every entity whose ``_type`` is "Person", "Company",
        "Organization" and "IndustryTerm" respectively. Entities of any
        other type are ignored. All four lists are empty when the Calais
        result carries no ``entities`` attribute.
    """
    names, companies, orgs, terms = [], [], [], []
    # Route each entity type of interest to the list that collects it.
    buckets = {
        "Person": names,
        "Company": companies,
        "Organization": orgs,
        "IndustryTerm": terms,
    }
    result = calais.analyze(body)
    # Other callers in this code base guard against a result that has no
    # ``entities`` attribute; treat that case as "no entities".
    for entity in getattr(result, "entities", []):
        bucket = buckets.get(entity["_type"])
        if bucket is not None:
            bucket.append(entity["name"])
    return names, companies, orgs, terms