示例#1
0
def populate_database_with_semantic_data_from_comments(calais_api_key, db_cursor, debug):
	"""Tag unprocessed comments with OpenCalais entities and topics.

	Fetches every row from ``comments`` with ``updated = 0``, sends the
	comment text to OpenCalais, inserts any returned entity names into
	``entities`` and topic category names into ``topics``, then marks the
	comment ``updated = 1`` -- even when analysis failed, so it is not
	retried.  (Python 2 code: print statement.)

	calais_api_key -- OpenCalais API key
	db_cursor      -- open DB-API cursor (caller commits/closes)
	debug          -- when true, print errors and progress dots to stdout
	"""

	calais = Calais(calais_api_key, submitter='usermine')

	db_cursor.execute('SELECT id, comment FROM comments WHERE updated = 0')

	for comment_data in db_cursor.fetchall():

		id = comment_data[0]
		comment = comment_data[1]

		try:
			result = calais.analyze(comment)

			# The response only grows these attributes when Calais found
			# something, hence the hasattr() guards.
			if hasattr(result, 'entities'):
				for entity in result.entities:
					entity_name = entity['name']
					db_cursor.execute('INSERT INTO entities (id, entity) VALUES (NULL, ?)', [entity_name])

			if hasattr(result, 'topics'):
				for topic in result.topics:
					topic_name = topic['categoryName']
					db_cursor.execute('INSERT INTO topics (id, topic) VALUES (NULL, ?)', [topic_name])
		except:
			# NOTE(review): bare except swallows every error (rate limits,
			# network) and the comment is still flagged as processed below.
			if debug:
				print sys.exc_info()

		# Mark as processed regardless of whether analysis succeeded.
		db_cursor.execute('UPDATE comments SET updated=1 WHERE id = ?', [id])

		if debug:
			print '.'
示例#2
0
def get_calais_subjects(text, uid):
    """Return subject tags for *text* from the OpenCalais service.

    Social tags whose importance exceeds the configured relevance
    threshold are collected.  An empty list is returned when no API key
    is configured or when the remote call fails.
    """
    prefs = getUtility(IRegistry).forInterface(ITagHelperSettingsSchema)
    key = prefs.calais_api_key
    threshold = prefs.calais_relevance
    if not key:
        return []
    try:
        analysis = Calais(key).analyze(text, external_id=uid)
    except:  # any service failure simply yields no tags
        return []
    tags = getattr(analysis, 'socialTag', [])
    return [t['name'] for t in tags if float(t['importance']) > threshold]
示例#3
0
def get_calais_subjects(text, uid):
    """Collect OpenCalais social tags for *text* above the relevance cutoff.

    Returns [] when no API key is configured or the service call fails.
    """
    reg = getUtility(IRegistry)
    cfg = reg.forInterface(ITagHelperSettingsSchema)
    key = cfg.calais_api_key
    cutoff = cfg.calais_relevance
    if not key:
        return []
    service = Calais(key)
    try:
        response = service.analyze(text, external_id=uid)
    except:  # treat any failure as "no tags"
        return []
    names = []
    if hasattr(response, "socialTag"):
        for entry in response.socialTag:
            if float(entry["importance"]) > cutoff:
                names.append(entry["name"])
    return names
示例#4
0
    def process_calais(content, key):
        """Run *content* through OpenCalais and pull out Person entities.

        Returns a dict with a single "people" key listing the names found
        (empty when the response carries no entities).
        """
        analysis = Calais(key).analyze(content)
        names = []
        for item in getattr(analysis, "entities", []):
            if item["_type"] == "Person":
                names.append(item["name"])
        return {"people": names}
示例#5
0
def analyze(model, text=None, backend='calais'):
  """Run *text* (or the model's own analysis text) through OpenCalais
  and persist every Person entity found.  (Python 2: comma except.)

  model   -- object providing analysis_text()
  text    -- optional explicit text overriding the model's own
  backend -- unused here; only the Calais backend is implemented

  NOTE(review): the ``people`` list is built but never returned, and
  ``e`` is unused -- confirm against callers whether a return is missing.
  """
  from calais import Calais
  from django.conf import settings
  calais = Calais(settings.CALAIS_API_KEY, submitter='newscredit')
  if text:
    result = calais.analyze(text)
  else:
    result = calais.analyze(model.analysis_text())
  people = []
  for entity in result.entities:
    if entity['_type'] == 'Person':
      try:
        # Reuse the Person for this Calais URI if we already stored one.
        person = Person.objects.get(uri=entity['__reference'])
      except Person.DoesNotExist, e:
        person = Person()
      person.from_calais(entity)
      person.save()
      people.append(person)
示例#6
0
文件: views.py 项目: rainzoo/jigsaw
def analyze(request):
	"""Django view: run POSTed 'content' through OpenCalais.

	Responds with the JSON entities on success, or an empty JSON string
	when analysis returns nothing or the 'content' key is missing.
	"""
	key = 'kgmykdr862hdfhkzkuchnxkc'  # NOTE(review): hard-coded API key
	service = Calais(key, 'python')
	try:
		analysis = service.analyze(request.POST['content'])
		payload = analysis.get_json_entities() if analysis else ""
	except KeyError:
		payload = ""
	return HttpResponse(json.dumps(payload), mimetype="application/json")
示例#7
0
def contract(request, contract):
    """Render 'contract.html' with every block mentioning *contract*.

    Blocks are fetched in date order and each one is annotated in place
    with the OpenCalais entities found in its 'string' field before
    being handed to the template.
    """
    store = connection()
    cursor = store.find({'contracts': contract}, sort=[('date', ASCENDING)])

    analyzer = Calais(settings.CALAIS, submitter="hello world")

    annotated = []
    for item in cursor:
        item['entities'] = analyzer.analyze(item['string']).entities
        annotated.append(item)

    context = {'items': annotated, 'contract': contract}
    return render_to_response('contract.html', context)
示例#8
0
def analyze(model, text=None, backend='calais'):
    """Analyze *text* (or the model's own analysis text) with OpenCalais
    and attach matching Person/Place/Organisation records to *model*.

    Returns (result, records) on success, None when the text is too
    short to analyze, or False when the Calais response lacked entities.
    (Python 2 code: comma except syntax.)

    model   -- object providing analysis_text() and add_entity()
    text    -- optional explicit text overriding the model's own
    backend -- unused; only the Calais backend is implemented
    """
    from calais import Calais
    from django.conf import settings

    calais = Calais(settings.CALAIS_API_KEY, submitter='newscredit')
    if not text:
        _text = model.analysis_text()
    else:
        _text = text

    # we cannot analyse if our total content is under 100 characters
    # after HTML cleaning. We leave OpenCalais to do this as they have
    # advanced heuristics to do it. If our text to analyse is less than
    # 100 characters, we skip the analysis.
    if len( _text ) < 100:
        return

    result = calais.analyze(_text)
    records = []

    try:
        for entity in result.entities:
            # Map the Calais entity type onto one of our model classes.
            if entity['_type'] == 'Person':
                _model = Person
            elif entity['_type'] in [ 'City', 'Country', 'Continent',
                                        'Facility', 'Region' ]:
                _model = Place
            elif entity['_type'] in [ 'Organization', 'Company' ]:
                _model = Organisation
            else:
                continue

            # Reuse the record for this Calais URI if one already exists.
            try:
                _record = _model.objects.get(uri=entity['__reference'])
            except _model.DoesNotExist, e:
                _record = _model()
            _record.from_calais(entity)
            _record.save()
            model.add_entity(_record)
            records.append(_record)
    except AttributeError:
        # this happens if Calais throws an error. To ensure we continue
        # processing other records pass this error and return False
        return False
    return result, records
示例#9
0
class classifier():
	"""Tag text via OpenCalais and pair each tag with a Qwiki URL and a
	Google image.  (Python 2 code; the broken mixed indentation in
	process() and _get_qwiki_url() is preserved byte-for-byte.)"""

	# OpenCalais API key shared by all instances.
	API_KEY = 'xyby6x47ycxj56bkkb83s9he'

	def __init__(self):
		self.calais = Calais(self.API_KEY, submitter='SocialTV Demo')

	def process(self, ip, text):
		"""Analyze *text*; return a list of dicts (text/tag/qwiki/img),
		one per non-blacklisted tag that has a Qwiki page.

		ip -- client IP, forwarded to the Google image lookup
		"""
		result = self.calais.analyze(text)
		all_tags = []

		if hasattr(result, 'entities'):
			all_tags.extend(result.entities)

		# Social tags are currently ignored (the extend() is disabled).
		if hasattr(result, 'socialTag'):
	        #all_tags.extend(result.socialTag)
			pass

		if len(all_tags) == 0:
			return []

		output = []
		for tag in all_tags:
			if tag['name'].lower() not in dirtyWords.DIRTY:
				qwiki = self._get_qwiki_url(tag['name'])
				if len(qwiki) > 0:
					img = self._get_google_image(ip, tag['name'])
					print "Tag: "+tag['name']
					output.append({'text':text ,'tag': tag['name'], 'qwiki':qwiki, 'img': img})
		return output

	
	def _get_qwiki_url(self, text):
		"""Return the Qwiki embed URL for *text*; on URLError fall back
		to the Qwiki search API, returning [] when nothing is found."""
		try:
			tmp_txt = text.replace(' ', '_')
			response = urllib2.urlopen("http://www.qwiki.com/embed/"+urllib2.quote(tmp_txt)+"?autoplay=true")
			return "http://www.qwiki.com/embed/"+urllib2.quote(tmp_txt)+"?autoplay=true"
	        except urllib2.URLError, e:
			response = urllib2.urlopen("http://embed-api.qwiki.com/api/v1/search.json?count=1&q="+urllib2.quote(text))
			html = response.read()
			html_eval = json.loads(html)
			if len(html_eval) > 0:
				return self._process_qwiki_results(text, html_eval)
			else:
				return []
示例#10
0
def main():
    """Read HTML-ish text from stdin, strip angle brackets, hyperlink
    the OpenCalais entities found in it, and print the result wrapped
    in <p> tags.  (Python 2 code: print statement.)"""

    if not sys.stdin:
        usage()
        sys.exit(1)

    input = sys.stdin.read() # read from stdin; NOTE: shadows builtin input()
    input = input.replace("<", "")
    input = input.replace(">", "")

    api_key = config.read_config('calais', 'api_key')
    calais = Calais(api_key, submitter="fridakahlo")
    entities = calais.analyze(input).entities
    
    #for e in entities: print e['name'], len(e['instances']), '<br>'

    # NOTE(review): the closing tag printed is "<p>", not "</p>" --
    # confirm whether that is intended.
    linked_text = add_links(entities, input)
    for line in linked_text.splitlines():
        print "<p>", line, "<p>"
示例#11
0
def entities(env, start_response):
	"""Extracts entities from resume utilizing the OpenCalais webservice.

	WSGI handler: starts a '200 OK' text/xml response up front, reads a
	hard-coded resume file, and returns an XML document listing each
	entity's name, type and relevance.  (Python 2: comma raise syntax.)
	"""

	start_response('200 OK', [('Content-Type', 'text/xml')])
	API_KEY = "kqyhhfppufvmvxspkstwjxw5"
	calais = Calais(API_KEY, submitter="resume_analysis")
	try:
		with open('Darin_Plutchok_Resume_Taxonomist.txt') as f:
			text = f.read()
	except:
		# NOTE(review): bare except; any failure maps to a 400 status.
		raise restlite.Status, '400 Error Reading File'
	try:
		results = calais.analyze(text)
	except Exception as e:
		# Surface the service error inline instead of raising.
		return "<error>%s</error>" % e

	entities_tuples = [(entity['name'], entity['_type'], entity['relevance'])
											  for entity in results.entities]
	doc = create_xml({'entities': entities_tuples})
	
	return [str(doc.toxml())]
示例#12
0
class calaisApi:
    """Thin OpenCalais wrapper with memoized entity extraction."""

    def __init__(self):
        self.calais = Calais(KEY, submitter="GodelTest")

    @persistent_memoize
    def calais_run(self, sentence):
        """Analyze *sentence* and return its entity list, False when no
        entities were found, or None when the service rejects the input."""
        try:
            response = self.calais.analyze(sentence)
        except ValueError:
            return None

        found = list(getattr(response, "entities", []))
        return found if found else False
示例#13
0
def entities(env, start_response):
    """Extracts entities from resume utilizing the OpenCalais webservice.

    WSGI handler: starts a '200 OK' text/xml response up front, reads a
    hard-coded resume file, and returns an XML document listing each
    entity's name, type and relevance.  (Python 2: comma raise syntax.)
    """

    start_response('200 OK', [('Content-Type', 'text/xml')])
    API_KEY = "kqyhhfppufvmvxspkstwjxw5"
    calais = Calais(API_KEY, submitter="resume_analysis")
    try:
        with open('Darin_Plutchok_Resume_Taxonomist.txt') as f:
            text = f.read()
    except:
        # NOTE(review): bare except; any failure maps to a 400 status.
        raise restlite.Status, '400 Error Reading File'
    try:
        results = calais.analyze(text)
    except Exception as e:
        # Surface the service error inline instead of raising.
        return "<error>%s</error>" % e

    entities_tuples = [(entity['name'], entity['_type'], entity['relevance'])
                       for entity in results.entities]
    doc = create_xml({'entities': entities_tuples})

    return [str(doc.toxml())]
示例#14
0
def _get_people(text):
    """Extract Person entities and their coreference contexts via Calais.

    Maps each person's canonical name to a list of
    (exact, suffix, prefix) tuples, one per mention in the document
    (pronouns included), giving the surrounding context of each
    coreference.  Returns False when the service reports no entities at
    all.  Calais API errors are deliberately not handled here.
    """
    annotations = Calais(API_KEY, submitter="tbc-coref-test").analyze(text)

    # Nothing recognized at all -- bail out.
    if not hasattr(annotations, 'entities'):
        return False

    people = {}
    for entity in annotations.entities:
        # Only Person entities matter; skip companies, places, etc.
        if entity['_type'] != 'Person':
            continue
        mentions = [(m.get('exact'), m.get('suffix', ''), m.get('prefix', ''))
                    for m in entity['instances']]
        label = entity.get("commonname", entity.get('name', None))
        people[label] = mentions
    return people
示例#15
0
def _get_people(text):
    """Run *text* through Calais and map canonical person names to the
    contexts of their mentions.

    Each value is a list of (exact, suffix, prefix) tuples describing
    where a coreference occurs and the text around it.  Returns False
    when Calais reports no entities.  API failures are left unhandled
    by design.
    """
    calais_client = Calais(API_KEY, submitter="tbc-coref-test")
    result = calais_client.analyze(text)

    if not hasattr(result, 'entities'):
        return False

    coreferences = {}
    for ent in result.entities:
        if ent['_type'] == 'Person':
            # Gather context for every mention of this person.
            spans = []
            for occ in ent['instances']:
                spans.append(
                    (occ.get('exact'), occ.get('suffix', ''), occ.get('prefix', '')))
            # Key by the canonical name when Calais provides one.
            coreferences[ent.get("commonname", ent.get('name', None))] = spans
    return coreferences
示例#16
0
文件: simple-ex.py 项目: jannae/rwet
from calais import Calais

# OpenCalais API key and client for this demo script.
API_KEY = "s5mba8qn5qb4vjmc663qxn8m"
calais = Calais(API_KEY, submitter="jannae")

# Analyze a fixed demo sentence and let Calais identify its entities.
result = calais.analyze("George Bush was the President of the United States of America until 2009.  Barack Obama is the new President of the United States now.")

# Print Calais's own human-readable summary of the analysis.
result.print_summary()
示例#17
0
from calais import Calais
import collections

# Set Calais API key and create a client instance
API_KEY = "g8gnzpdz52gkwyduv75zecem"
calais = Calais(API_KEY, submitter="python-calais demo")

# Demo text (immediately overwritten by the file contents below)
input_text = "George Bush was the President of the United States of America until 2009.  Barack Obama is the new President of the United States now."

# `with` closes the file automatically; the previous `f.closed` line was
# a no-op expression and has been removed.
with open('stdin_1.txt', 'r') as f:
    input_text = f.read()

result = calais.analyze(input_text)
#result.print_entities()

# Dictionary that will contain the linked data: entity name -> RDF id
link_list = {}
# Detected references count (collected for assignment)
detected_count = 0

# If Calais resolved an entity to an RDF value, use that as the link.
# (Iterate entities directly instead of indexing via range(len(...)).)
for entity in result.entities:
    if 'resolutions' in entity:
        name = entity['name']
        link = entity['resolutions'][0]['id']
        link_list[name] = link
示例#18
0
 analyzed = 0
 skipped = 0
 topics = {}
 no_topics = 0
 named_entities = {}
 no_named_entities = 0
 
 for tweet in data:
     try:
         analyzed += 1
         
         # Wait until wait time is passed and let Calais analyze a new tweet
         time_passed = time.time() - last_time
         wait_time = max(wait_per_request - time_passed, 0)
         time.sleep(wait_time)   
         result = calais.analyze(tweet[1])
         last_time = time.time()
         
         # Extract entities
         try:
             for entity in result.entities:
                 name = entity["name"]
                 if (name == u'RT') | (entity["_type"] == u'URL'):
                     continue
                 cursor.execute(q_entity_insert, (str(tweet[0]), name, str(entity["relevance"])))
                 db.commit()
                 if name in named_entities:
                     named_entities[name] += 1
                 else:
                     named_entities[name] = 1
         except AttributeError:
示例#19
0
def analyze_url(request):
	"""Django view: tag the article at POSTed 'url' and render its tags.

	New articles are fetched with Goose, analyzed with OpenCalais, and
	every entity/topic with confidence > .3 is stored as a Tag row.
	Known articles just reload their stored tags.  (Python 2 code:
	print statement, dict.has_key, dict.iteritems.)
	"""
	# Insert the article into the database if it's new
	a, created = Article.objects.get_or_create(
		url = request.POST['url']
	)
	a.save()
	
	# List of tags for UI
	tag_list = []

	# If the article is new, then get it's tags
	if created:
		# Use Goose to get the article text
		g = Goose()
		article_text = g.extract(url = a.url).cleaned_text.encode('utf-8')

		# Use python-calais to analyze the article text
		calais = Calais(settings.OC_API_KEY)
		result = calais.analyze(article_text)

		
		
		try:
			# Create a dictionary to store all the entities and relevance scores
			# Get rid of all duplicates
			entity_dict = dict([])
			for entity in result.entities:
				if not entity_dict.has_key(entity['name']):
					entity_dict[entity['name']] = entity['relevance']

			# Sort the entities and topics by confidence/relavence
			entity_list = sorted(entity_dict.iteritems(), key=itemgetter(1), reverse=True)

			# Insert the OpenCalais entities and topics with confidence > .3 into the database and add to a list for UI
			for tag, confidence in entity_list:
				if confidence > .3:
					t = Tag(
						article = a,
						tag = tag,
						confidence = confidence,
						service = Tag.OPEN_CALAIS,
					)
					t.save()
					tag_list.append(t)
				else:
					# List is sorted descending, so the rest are lower too.
					break
		except KeyError:
			print 'No entities found'

		try:
			# Create a dictionary to store all the topics and relevance scores
			# Get rid of all duplicates
			topic_dict = dict()
			for topic in result.simplified_response['topics']:
				if not topic_dict.has_key(topic['categoryName']):
					topic_dict[topic['categoryName']] = topic['score']

			# Sort topics by relavence
			topic_list = sorted(topic_dict.iteritems(), key=itemgetter(1), reverse=True)

			for tag, confidence in topic_list:
				if confidence > .3:
					t = Tag(
						article = a,
						tag = tag,
						confidence = confidence,
						service = Tag.OPEN_CALAIS,
					)
					t.save()
					tag_list.append(t)
				else:
					break
		except KeyError:
			print 'No topics found'

		###
		# TODO: Add AlchemyAPI support here
		###

	# If the article is old, just get the tags from the database
	else:
		tag_list = list(Tag.objects.filter(article = a))

	# Randomize tag order for display
	shuffle(tag_list)

	# Context for the template
	c = {
		'tags': tag_list
	}

	return render(request, 'tag_list.html', c)
示例#20
0
	
		#feature counter >= 2 works
		feature_counter = 0
		for entry in sportslist:
			if analysetext.lower().find(entry.lower()) >= 0:
				feature_counter += 1

		#classifier 1
		#naive bayes
		_topic = classifier.classify(analysetext)
		_score = classifier.get_score(analysetext)

		#classifier 2
		#opencalais crosscheck   
		try:
			result = calais.analyze(analysetext)
			topic =  result.get_topics()
			score =  result.get_topic_score()
		except:
			topic = "None"
			score = 0.0

		classifier_json = json.dumps({'topic':topic,'score':score,'_topic':_topic,'_score':_score})
		if topic == "Sports" and feature_counter < 2:
			sports_score = str(score)
		elif topic == "None" and _topic == "sports" and feature_counter < 2:
			sports_score = str(_score)
		elif feature_counter >= 2:
			sports_score = '1.0'
		else:
			sports_score = '0.0'
	htmlname = "Text/mirror.html";
elif fid ==4:
	fname = "Text/independent.txt";
	htmlname = "Text/independent.html";
else:
	fname = "Text/dailymail.txt";
	htmlname = "Text/dailymail.html";


f = open(fname, 'r+')
article = f.read()
f.close()

filter = ['Currency', 'IndustryTerm' , 'MedicalCondition']

result = calais.analyze(article)

#result.print_summary()
result.print_entities()

article = article.decode('ascii', 'replace').encode('ascii', 'replace')
enriched_article = article;

for entity in result.entities:
	if entity['_type'] not in filter:
		name = entity['name']
		print "Resolving " + name
		name = name.decode('ascii', 'replace').encode('ascii', 'replace')
		str = ""
		try:
			link = entity['resolutions'][0]['id']
示例#22
0
文件: main.py 项目: kdimatteo/finint
def get_entities(content):
    """Analyze *content* with OpenCalais and print the entities found.

    The API key is read from the API_KEY environment variable.
    """
    client = Calais(os.environ['API_KEY'], submitter="python-calais demo")
    client.analyze(content).print_entities()
示例#23
0
def web_extract_terms(text, raw_query='', service='tagthe'):
    """
        Given a text, extract keyword terms with the selected web_service.
        Args:
            text: raw text from where to extract terms
            raw_query: a query which may contextualize the extraction (only used by yahoo)
            service: which web service to use
        Returns:
            query: a sequence of query terms
        Raises:
            ValueError: if *service* is not a configured web service
            WebServiceException: when the remote call fails or reports an error
    """
    service = service.lower().strip()

    if service not in WEB_SERVICES.keys():
        raise ValueError('%s is an invalid web service, possible choices are %s' % (service, repr(WEB_SERVICES.keys())))

    #1. Build the query:
    query = {}
    apikey = settings.WEB_SERVICES_KEYS.get(service, '')

    if service == 'wordsfinder':
        query = {
            'apikey' : apikey,
            'context': text + raw_query,
        }
    elif service == 'alchemy':
        query = {
            'apikey' : apikey,
            'text' : text + raw_query,
            'outputMode' : 'json'
        }
    elif service == 'yahoo':
        query = {
            'appid': apikey,
            'context': text,
            'output': 'json',
        }
        if raw_query:
            query.update({'query': raw_query})
    elif service == 'tagthe':
        query = {
            'text': text + raw_query,
            'view': 'json',
        }
    elif service == 'opencalais':
        #use the python interface, obtained from:
        #http://www.opencalais.com/applications/python-calais-python-interface-opencalais-api
        #logging.debug('Using the opencalais interface with key %s' % apikey)
        s = Calais(apikey)
        try:
            res = s.analyze(text + raw_query)
        except:
            raise WebServiceException(service, 'error in request')
        #logging.debug('The raw response: %s'  % res.raw_response)
        if hasattr(res, 'topics') or hasattr(res, 'entities'):
            retval = [t['categoryName'] for t in res.topics] if hasattr(res, 'topics') else []
            retval += [t['name'] for t in res.entities] if hasattr(res, 'entities') else []
            return retval
        else:
            #raise WebServiceException(service, 'No topics or entities found')
            #logging.info("OpenCalais didn't return topics|entities for %s" %text)
            return ["",]
#    elif service == 'extractor':
#        #use the python interface
#        #logging.debug('using the extractor interface with key %s' % apikey)
#        extractor=ExtractorService(key=apikey, url=WEB_SERVICES[service])
#        raw_response = extractor.extract(text + raw_query)
#        #logging.debug('The raw response: %s' % raw_response)
#
#        if raw_response.get('ExtractionStatus') == '-1':
#            print "failure!"
#            raise WebServiceException(service, "Failure in request")
#        else:
#            #TODO: what DOES it return?
#            return raw_response



    #2. Try to call the service:
    resp = None
    #logging.debug('requesting %s' % WEB_SERVICES[service]+'?%s'%urlencode(query))
    try:
        #HACK: tagthe has issues with POST requests, so try and do a GET
        #max length for a GET request is 2048:
        #http://stackoverflow.com/questions/1344616/max-length-of-query-string-in-an-ajax-get-request
        if service == 'tagthe': # and len(urlencode(query)) <= 2048:
            resp_url = urlopen(WEB_SERVICES[service]+'?%s'%urlencode(query))
        else:
            #HACK: fallback to yahoo if the request is too much for tagthe
            #service = 'yahoo' if service == 'tagthe' else service
            resp_url = urlopen(WEB_SERVICES[service], urlencode(query))
        resp = resp_url.read()
        #this causes the exception...
        #logging.debug( u"%s returned %s" % (service, resp))
    except Exception as e:
        #TODO: retry in timeouts and stuff
        #logging.debug('Error in request: %s' % e, exc_info = True)
        raise WebServiceException(service, 'Error in request to service : %s' % e)

    #3. Process the response:
    if resp:
        result = ''
        if service == 'alchemy':
            data = json.loads(resp)
            if data['status'] == 'ERROR':
                raise WebServiceException(service, 'call returned error status')
            result = [re.sub('-[^ ] ', '', e['text']) for e in data['keywords']]

        elif service == 'yahoo':
            data = json.loads(resp)
            result = data['ResultSet']['Result']

        elif service == 'wordsfinder':
            parsed_response = parseString(resp)
            e = parsed_response.getElementsByTagName('error')
            if e:
                # BUG FIX: `e` is a NodeList and the old code read the
                # nonexistent attribute `datad`, so this path raised
                # AttributeError instead of WebServiceException.  Read
                # the first <error> node's text content instead.
                raise WebServiceException(service, 'error code %s' % e[0].firstChild.data)
            else:
                result = [node.firstChild.data for node in parsed_response.getElementsByTagName('keyword')]
        elif service == 'tagthe':
            data = json.loads(resp)
            if 'memes' in data and 'dimensions' in data['memes'][0] and 'topic' in data['memes'][0]['dimensions']:
                result = data['memes'][0]['dimensions']['topic']
                #logging.debug(u'tagthe result %s' %result)
            else:
                result = ['', ]


        return [unescape(w) for w in result]


    # TODO: maybe find a way to call 'em all and have a super-set of kws?
    else:
        return ''
示例#24
0
def get_entities(content):
    """Print the OpenCalais entities detected in *content* (API key is
    taken from the API_KEY environment variable)."""
    api_key = os.environ['API_KEY']
    analysis = Calais(api_key, submitter="python-calais demo").analyze(content)
    analysis.print_entities()
示例#25
0
                        n = sib

            #PERP - INDIVIDUALS
            if 'ENTITIES/INDIVIDUALS INVOLVED' in soup.find_all('h2')[i]:
                next = True
                n = soup.find_all('h2')[i]
                perp_indivs = []
                perp_rank = []
                while next == True:
                    sib = n.next_sibling
                    if sib.name == 'h2':
                        next = False
                    elif sib.name != 'br' and sib != '\n' not in sib:

                        #test to see if it's a person using calais
                        r = calais.analyze(sib.encode('utf-8').strip())
                        if hasattr(r, 'entities'):
                            for j in range(len(r.entities)):
                                if r.entities[j]['_type'] == 'Person':
                                    perp_indivs.append(r.entities[j]['name'])
                                if r.entities[j]['_type'] == 'Position':
                                    position = r.entities[j]['name']

                                    #PERP RANK
                                    fit = False
                                    if 'President' in position or 'Chairman' in position or 'CEO' in position or 'Chief Executive Officer' in position:
                                        if 'CEO/Chairman/President' not in perp_rank:
                                            perp_rank.append(
                                                'CEO/Chairman/President')
                                            fit = True
                                    if 'Founder' in position:
示例#26
0
      _topic = "None"
      _score = 0

   '''
   rate limit issues
   Submission Rate (per second), 4, 
   '''
   time.sleep( 0.3 )

   '''
   classifier 2
   '''
   #opencalais crosscheck
   print '================'
   if real_url_hash in ('e3cb4b6333506ffe5d746c9fabe32a00f9513f46', '76a134d8cb993198d89da02889592d3984a05861', 'c55fc7c88b1d9c0a58a1f29e44f9e0a69f1cc3fa'):
      result = calais.analyze("None")
   else:
      result = calais.analyze(analysetext)
   topic =  result.get_topics()
   score =  result.get_topic_score()
   if topic:
         print 'OC topic:' + topic
         print 'OC score:%f'% (score*100)
   else:
      topic = "None"
      score = 0
   print '================'
   jsonOutput = "{\"topic\":\"%s\" , \"score\":\"%f\" , \"_topic\":\"%s\" , \"_score\":\"%f\"}" % (topic, score, _topic, _score) 
   print jsonOutput

   sqlupdate = "UPDATE condor.url_info SET topic_blob=\'" + jsonOutput + "\' WHERE real_url_hash=\'" + real_url_hash + "\';"
示例#27
0
def classify(real_url_hash):
   """Classify the URL identified by *real_url_hash* as sports or not.

   Loads a pickled naive-Bayes classifier and cross-checks it against
   OpenCalais topics, then writes ``topic_blob`` and ``sports_score``
   back to the ``url_info`` table.  (Python 2 code; the original mixed
   tab/space indentation is preserved byte-for-byte.)

   NOTE(review): every SQL statement here is built by string
   concatenation -- injection-prone; parameterize if revived.
   """
   API_KEY = "***REMOVED***"
   '''
   # Open database connection
   db = MySQLdb.connect(
       host = '127.0.0.1', 
       user = '******', 
       passwd = 'globelab', 
       db = 'condor', 
       port = 3307)
   '''
   # Prefer a local config file; fall back to the default one.
   try:
	with open('config-local.json') as fh:
		config = json.load(fh)
   except IOError:
	with open('config.json') as fh:
		config = json.load(fh)

   db = MySQLdb.connect(
	host=config['mysql']['host'],
	port=config['mysql']['port'],
 	user=config['mysql']['user'],
 	passwd=config['mysql']['password'],
 	db=config['mysql']['database'],
	use_unicode=True,
    	charset="utf8")
  
   db.autocommit(True)

   #call calais function
   calais = Calais(API_KEY, submitter="python-calais classify")

   #run from train set
   pkfile = open('savedclassifier.pickle','rb')
   classifier = pickle.load(pkfile)

   # prepare a cursor object using cursor() method
   cursor = db.cursor()

   # Fetch the stored embedly blob for this URL hash.
   sql = "SELECT embedly_blob,real_url_hash  FROM  url_info WHERE real_url_hash='" + real_url_hash + "' LIMIT 1;" 

   # Execute the SQL command
   cursor.execute(sql)

   # Fetch all the rows
   results = cursor.fetchall()

   #browse through results
   for row in results:

      #get results
      real_url_hash = row[1]
      jsondecode = json.loads(row[0])
      title = jsondecode['title']
      description = jsondecode['description']
      url = jsondecode['url']

      # Build the text to classify from whatever fields are present.
      if title and description and url:
         analysetext =  title + ' ' + description + ' ' + url
      else:
         analysetext = ' '
         if title:
            analysetext = analysetext  + ' ' +  title
         if description:
            analysetext = analysetext  + ' ' +  description
         if url:
            analysetext = analysetext  + ' ' +  url
            
      analysetext.encode("utf8")
      analysetext = analysetext.encode("utf8")
      analysetext= analysetext.replace('"', '\'')
      
      
      #classifier 1
      #naive bayes
      _topic = classifier.classify(analysetext)
      _score = classifier.get_score(analysetext)
      
      #classifier 2
      #opencalais crosscheck   
      try:
         result = calais.analyze(analysetext)
         topic =  result.get_topics()
         score =  result.get_topic_score()
      except:
         # Any Calais failure: record the failure row and fall back.
         topic = "None"
         score = 0.0
         sqlerr = 'INSERT INTO error_classify (real_url_hash, text) VALUES("' + real_url_hash+ '","' + analysetext + '");'
         cursor.execute(sqlerr)
         
      print real_url_hash
      print analysetext
      print topic
      print _topic
      
      #create json output      
      jsonOutput = "{\"topic\":\"%s\" , \"score\":\"%f\" , \"_topic\":\"%s\" , \"_score\":\"%f\"}" % (topic, score, _topic, _score) 
      sqlupdate = "UPDATE url_info SET topic_blob=\'" + jsonOutput + "\' WHERE real_url_hash=\'" + real_url_hash + "\';"
      x = cursor.execute(sqlupdate)
      trace =  'trace: updated url_info, url hash [%s]: %d' % (real_url_hash, x)

      # Calais wins when it says Sports; otherwise fall back to bayes.
      if topic == "Sports":
         cursor.execute("UPDATE url_info SET sports_score=\'" + str(score) + "\' WHERE real_url_hash=\'" + real_url_hash + "\';")
      elif topic == "None" and _topic == "sports":
         cursor.execute("UPDATE url_info SET sports_score=\'" + str(_score) + "\' WHERE real_url_hash=\'" + real_url_hash + "\';")
      else:
         cursor.execute("UPDATE url_info SET sports_score='0' WHERE real_url_hash=\'" + real_url_hash + "\';")

   db.close()
   pkfile.close()
    processed_tags = []
    for item in params['fields']:
        try:
            d = item.copy()
            
            field = d.pop('name')
            proc_type = d.pop('process_type', process_type)
            markup = d.pop('markup', False)

            data = getattr(obj, field)

            data = force_unicode(getattr(obj, field))
            
            # Analyze the text (data)
            result = c.analyze(data)

            # Retrieve the Django content type for the obj
            ctype = ContentType.objects.get_for_model(obj)
            # Remove existing items, this ensures tagged items are updated correctly
            SuperTaggedItem.objects.filter(content_type=ctype, object_id=obj.pk, field=field).delete()
            if settings.PROCESS_RELATIONS:
                SuperTaggedRelationItem.objects.filter(content_type=ctype, object_id=obj.pk, field=field).delete()

            entities, relations, topics = [], [], []
            # Process entities, relations and topics
            if hasattr(result, 'entities'):
                entities = _processEntities(field, result.entities, obj, ctype, proc_type, tags)

            if hasattr(result, 'relations') and settings.PROCESS_RELATIONS:
                relations = _processRelations(field, result.relations, obj, ctype, proc_type, tags)                
示例#29
0
    fout = open(("results/" + filename + ".html"), "w")
    fout.write('<html>')
    fout.write('\r\n')
    fout.write('<head><title>' + filename + '</title></head>')
    fout.write('\r\n')
    fout.write('<body>')

    with open(("articles/" + filename + ".txt"), "r") as myfile:
        sys.stdin = myfile
        content = ""
        for line in sys.stdin:
            content += line

        API_KEY = "f7vhuv2kt4fxufuvv6eznwpe"
        calais = Calais(API_KEY, submitter="python-calais newsparser")
        result = calais.analyze(content)

        print "Summary of the Calais Analysis"
        result.print_summary()

        print "Entity of the Calais Analysis"
        result.print_entities()

        i = 0
        temp = []
        entityList = []
        html = []
        for entity in result.entities:
            if result.entities[i]["_type"] in [
                    "City", "Company", "Country", "Movie", "Organization",
                    "Person"
示例#30
0
from calais import Calais

# OpenCalais demo: analyze a short text and dump what the service extracted.
API_KEY = "k6s6cewwwc5zkemjqpw7yhru"
calais = Calais(API_KEY, submitter="python-calais demo")
result = calais.analyze("michelle obama")

# The print_* helpers write to stdout themselves and return None, so
# wrapping them in an outer `print` emitted a stray "None" after each call.
result.print_summary()
result.print_topics()
result.print_relations()

def main():
  """Tally students per country of origin and write the counts to origin.csv.

  Reads (email, free-text address) pairs from the student-profile CSV,
  asks OpenCalais to extract geographic entities from each address, and
  falls back to the email TLD (mapped through Country_code.csv) when the
  address yields nothing usable.  US entries additionally get per-state
  "(US) - <state>" buckets.
  """
  # read in csv file to extract the email and addresses field
  # put email and addr into a list of tuples 
  email_addr = []
  with open('OECx_PH241x_3T2014_student_profile_info_2014-10-20-1645.csv', 'rU') as f:
      reader = csv.reader(f)
      for row in reader:
          # row[3] is treated as the email, row[9] as the address (0-based)
          pair = [row[3], row[9]]
          email_addr.append(pair)

  # create dictionary to find the country code | to iterate over the dictionary=> for key in d:
  country_code = {}
  with open('Country_code.csv', 'rU') as f:
      reader = csv.reader(f)
      for row in reader:
          # each row is a single cell "<tld> <country name>", split once on the first space
          key = row[0].split(' ', 1)[0].lower()
          value = row[0].split(' ', 1)[1].lower()
          country_code[key] = value

  # make Calais calls to extract country name
  api_key = 'wukyjrm778py5wry9qdtgk9u'
  calais = Calais(api_key)

  # dictionary to store all the results
  country_count = {}
  country_count['(TOTAL)'] = 0
  country_count['united states'] = 0
  country_count['~origin unknown'] = 0
  count = 0
  

  for pair in email_addr:
    # check == 1 means "fall back to classifying by the email's TLD"
    check = 0
    try:
      response = {}
      if pair[1] != '':
        response = calais.analyze(pair[1])
        # if the addr contains country information
        if hasattr(response, 'entities'):
          # entry is a list of 3 elements: priority (3 for ProvinceOrState, 2 for Country, 1 for EmailAddress ), Country, Province 
          entry = [-1, '', '']
          for each in response.entities:
            if each['_type'] == 'ProvinceOrState':
              
              try: 
                # 'resolutions'/'containedbycountry' may be absent, hence the KeyError guard
                entry[1] = each['resolutions'][0]['containedbycountry'].lower()
                entry[0] = 3
                entry[2] = each['name'].lower()
              except KeyError:
                print 'Country name cannot be retrieved'

            elif each['_type'] == 'Country':
              # keep the highest-priority evidence seen so far
              if entry[0] < 2:
                entry[0] = 2
                entry[1] = each['name'].lower()
            elif each['_type'] == 'EmailAddress':
              if entry[0] < 1:
                entry[0] = 1

          if entry[0] == 3:
            # a province/state was found; entry[1] is its containing country
            name = '(US) - ' + entry[2]
            if entry[1] not in country_count:
              country_count[entry[1]] = 1
            else:
              country_count[entry[1]] = 1 + country_count[entry[1]]
              # NOTE(review): the per-state bucket only updates when the country
              # was already counted; the first state seen for a brand-new country
              # is never recorded under `name` — looks unintentional, confirm.
              if entry[1] == 'united states':
                if name not in country_count:
                  country_count[name] = 1 
                else:
                  country_count[name] = 1 + country_count[name] 

          elif entry[0] == 2:
            if entry[1] not in country_count:
              country_count[entry[1]] = 1   
            else:
              country_count[entry[1]] = 1 + country_count[entry[1]]

          elif entry[0] == 1:
            check = 1 # go through email check

          else:
            # Calais returned entities, but none of the three types we look for
            country_count['~origin unknown'] = country_count['~origin unknown'] + 1

      else: 
        check = 1

      # if addr is empty, query email address mapping table; if no entry, Unknown add 1
      # here we assume that all entries without addr and without strong indication of country origins in their emails will be categorized under the USA entry
      if check == 1: 
        # determine entry name
        email_endstr = pair[0].split('.')[-1].lower()
        if email_endstr in country_code:
          name = country_code[email_endstr]
        else:
          name = '~origin unknown'  
        # add entry 
        if name not in country_count:
          country_count[name] = 1
        else:
          country_count[name] = country_count[name]+1

    except ValueError:
      # raised by calais.analyze for text it cannot process
      print 'Calais could not handle the language'
      country_count['~origin unknown'] = country_count['~origin unknown'] + 1
    count = count +1
    print 'Number of entries queried: ' + str(count)
  
  country_count['(TOTAL)'] = count
  # sorted list of bucket names; '(TOTAL)' sorts first
  country = sorted(country_count)

  print country
  us = 0
  with open('origin.csv', 'w') as fp:
    a = csv.writer(fp, delimiter=',')
    for key in country:
      if key != 'united states':
        a.writerow([key, country_count[key]])
      # emit the 'united states' row exactly once, during the first iteration
      # (so it appears near the top of the file rather than in sorted position)
      if us == 0:
        a.writerow(['united states', country_count['united states']])
        us = 1
def main():
  """Tally students per country and write the counts to countrybreakdown.csv.

  Simpler variant of the origin tally: when an address is present, takes
  the first Calais entity's 'containedbycountry' as the country; when the
  address is empty (or is really a mistyped email), maps the email TLD
  through Country_code.csv, defaulting to 'united states'.
  """
  # read in csv file to extract the email and addresses field
  # put email and addr into a list of tuples 
  email_addr = []
  with open('OECx_PH241x_3T2014_student_profile_info_2014-10-20-1645.csv', 'rU') as f:
      reader = csv.reader(f)
      for row in reader:
          # row[3] is treated as the email, row[9] as the address (0-based)
          pair = [row[3], row[9]]
          email_addr.append(pair)

  # create dictionary to find the country code | to iterate over the dictionary=> for key in d:
  country_code = {}
  with open('Country_code.csv', 'rU') as f:
      reader = csv.reader(f)
      for row in reader:
          # each row is a single cell "<tld> <country name>", split once on the first space
          key = row[0].split(' ', 1)[0].lower()
          value = row[0].split(' ', 1)[1].lower()
          country_code[key] = value

  # make Calais calls to extract country name
  api_key = 'wukyjrm778py5wry9qdtgk9u'
  calais = Calais(api_key)

  # dictionary to store all the results
  country_count = {}
  country_count['united states'] = 0
  count = 0
  
  for pair in email_addr:
    try:
      # response stays a plain dict (no .entities attribute) when addr is empty
      response = {}
      if pair[1] != '':
        response = calais.analyze(pair[1])
      # if the addr contains country information
      name = ''
      if hasattr(response, 'entities'):
        print response.entities
        # NOTE(review): assumes entities[0] carries 'containedbycountry';
        # a KeyError/IndexError here is NOT caught (only ValueError below) — confirm.
        name = response.entities[0]['containedbycountry'].lower()
        if '@' in name: #where email addresses are wrongly entered as addr
          last_str = pair[0].split('.')[-1].lower()
          if last_str in country_code:
            name = country_code[last_str]
          else:
            name = 'united states'
          if name not in country_count:
            country_count[name] = 1
          else:
            country_count[name] = country_count[name]+1
        else:
          if name not in country_count:
            country_count[name] = 1
          else:
            country_count[name] = country_count[name]+1
      # otherwise, check the email addr    
      else:
        last_str = pair[0].split('.')[-1].lower()
        if last_str in country_code:
          name = country_code[last_str]
        else:
          name = 'united states'
        if name not in country_count:
          country_count[name] = 1
        else:
          country_count[name] = country_count[name]+1
    except ValueError:
      # raised by calais.analyze for text it cannot process
      print 'Calais could not handle the language'
    count = count +1
    print 'Number of entries queried: ' + str(count)
  
  print country_count

  with open('countrybreakdown.csv', 'w') as fp:
    a = csv.writer(fp, delimiter=',')
    for key in country_count:
      a.writerow([key, country_count[key]])
示例#33
0
import io,sys
from calais import Calais
API_KEY = "rg72c9882ypfjm24tvrfu6ab"
calais = Calais(API_KEY, submitter="python-calais demo")

##read text file
with open('file3.txt', 'r') as content_file:
    content = content_file.read()

##perform analysis on text
result = calais.analyze(content)
result.print_entities()

html_file = open("HTMLFile3.html", "w")

##the resulting entities obtained are not sorted, so we sort them here:
def comp(x, y):
    """cmp-style comparator: orders entities by descending first-instance offset.

    Returns positive when y's first instance starts later than x's,
    negative when earlier, and zero when both start at the same offset.
    """
    x_offset = x['instances'][0]['offset']
    y_offset = y['instances'][0]['offset']
    return y_offset - x_offset
sorted_results = sorted (result.entities,cmp=comp)

b = content.decode("utf-8")

for i in range (len(sorted_results)):   #for each entity
    offset = sorted_results[i]['instances'][0]['offset']    #find offset
    length = sorted_results[i]['instances'][0]['length']    #find length
    total = offset + length                                 #find total length

    if offset != sorted_results[i-1]['instances'][0]['offset']: #to prevent same words being linked twice
        if 'resolutions' in sorted_results[i]:  #if rdf document exists
            link = sorted_results[i]['resolutions'][0]['id']
            data = "<a href = \"" + link + "\" target=\"_blank\">" + sorted_results[i]['name'] + "</a>"
from calais import Calais
API_KEY = "djgq52vv8uufzykmnb9g7myv"
calais = Calais(API_KEY, submitter ="python-calais demo")
# Three sample analyses against the OpenCalais service.
result = calais.analyze('''Microsoft is a big company. George Bush was the President of the United States
of America until 2009.  Barack Obama is the new President of the United States now.''')
result2 = calais.analyze('''Microsoft is a big company''')
result3 = calais.analyze('''Troubled drug development company SFBC International said on Monday it has changed its name to PharmaNet Development Group Inc. and will be traded on Nasdaq under the stock symbol "PDGI".''')
# Collect the names of every "Technology" entity found in the third text.
d={}
a=[]
for i in result3.entities:
    if i["_type"]=="Technology":
        a.append(i["name"])
        # d["Technology"] aliases the shared list a, so d is reprinted on every
        # match with the full set of names accumulated so far.
        d["Technology"]=a;
        print d

from calais import Calais
import os

API_KEY = "v5q6rvm7h4uww6sumjxuw9t7"
calais = Calais(API_KEY, submitter="python-calais demo")

# Read the article text; `with` guarantees the file is closed even if
# read() raises (the old open/read/close sequence leaked on error).
with open("Text/test.txt", 'r+') as f:
    NYT = f.read()

# Send the text to OpenCalais and dump the extracted summary and entities.
result = calais.analyze(NYT)

result.print_summary()
result.print_entities()
示例#36
0
from calais import Calais
API_KEY = "dkm645ejqmq7aajt8cp6zxk7"
calais = Calais(API_KEY, submitter="python-calais demo")
# Analyze a short free-text description and print everything Calais extracted.
result = calais.analyze(
    "My 15 year old Daughter has sores in her genital area and her mouth.She swares she did nothing but kiss her boyfriend.She also has flu-like symptoms."
)
result.print_summary()
result.print_entities()
result.print_topics()
# Block on Enter so the console output stays visible before the script exits;
# the typed value is unused.
p = raw_input()