def indico_batch_ner():
    """another ONE-OFF method to call the indico.io API to batch NER 18192 texts

    Reads file names from texts/filenames.txt, loads each via get_texts(),
    runs indico named_entities over every message body, and writes one CSV
    row per message to sentiments.csv (the result column is None when the
    API call failed).  Failures are recorded in indico_ner_errors.txt.
    """
    with open('sentiments.csv', 'wb') as f:
        writer = csv.writer(f)
        texts = []
        with open('texts/filenames.txt', 'r') as filenames:
            # One file name per line; prefix each with the texts directory.
            # (Original used two redundant map() passes over a list comp.)
            fn_list = ['texts/texts/' + line.strip() for line in filenames]
        for fn in fn_list:
            texts.append(get_texts(fn))  # returns TextMessage object
        # Flatten the per-file message lists into one flat list.
        texts = [item for sublist in texts for item in sublist]
        with open('indico_ner_errors.txt', 'w') as error_log:
            for text in texts:
                sentiment_result = None
                try:
                    sentiment_result = named_entities(text.body.encode(), api_key=INDICO_API_KEY)
                except Exception as e:
                    # Bug fix: was `except BaseException`, which also swallowed
                    # KeyboardInterrupt/SystemExit; add a newline so log
                    # entries don't run together on one line.
                    error_log.write(str(e) + '\n')
                finally:
                    # Row is written even on failure so every text gets a line.
                    writer.writerow([
                        unicode(s).encode('utf-8') for s in [
                            text.msg_id, text.posix, repr(text.sent),
                            text.body, repr(text.mentions), sentiment_result
                        ]
                    ])
def analysis(data): sentiment = ind.sentiment_hq(data) tags = sort(ind.text_tags(data)) languages = sort(ind.language(data)) politics = sort(ind.political(data)) keywords = sort(ind.keywords(data)) names = sort(ind.named_entities(data)) print "Sentiment", sentiment print "\n\n\nTags" for t in tags: print t[0], float(t[1]) * 100 print "\n\n\nLanguages" for l in languages: print l[0], float(l[1]) * 100 print "\n\n\nPolitical" for p in politics: print p[0], float(p[1]) * 100 print "\n\nkeywords" for k in keywords: print k[0], float(k[1]) * 100
def test_named_entities(self):
    """NER smoke test: known entities are detected and each result entry
    exposes exactly the expected keys.
    """
    text = "London Underground's boss Mike Brown warned that the strike ..."
    expected_entities = ("London Underground", "Mike Brown")
    expected_keys = set(["categories", "confidence"])
    entities = named_entities(text)
    for entity in expected_entities:
        # Bug fix: the original asserted `entity in expected_entities`,
        # which is a tautology; the intent is to check the API result.
        assert entity in entities
        assert not (set(entities[entity]) - expected_keys)
def entityMatch(message):
    """Return the first known entity found in *message* with high
    confidence and a strong "organization" category score, else "None".
    """
    recognized = indicoio.named_entities(message)
    for candidate in recognized.keys():
        details = recognized[candidate]
        # Short-circuit `and` preserves the original nested-if order:
        # confidence check, then organization score, then membership.
        if (details["confidence"] > .8
                and details["categories"]["organization"] > .5
                and candidate in entities):
            return candidate
    return "None"
def score(self, slide_length, window_length, AItype='tags'):
    """Parse the text into windows and score each window with an indico API.

    Args:
        slide_length: stride, passed through to self.parse().
        window_length: window size, passed through to self.parse().
        AItype: which scorer to run — 'tags', 'keywords', or 'names'.

    Raises:
        Exception: if AItype is not one of the recognized categories.
    """
    self.parse(slide_length, window_length)
    if AItype == 'tags':
        self.scores['tags'] = [indicoio.text_tags(i) for i in self.strings]
    elif AItype == 'keywords':
        self.scores['keywords'] = [indicoio.keywords(i) for i in self.strings]
    elif AItype == 'names':
        self.scores['names'] = [indicoio.named_entities(i) for i in self.strings]
    else:
        # Bug fix: the original formatted an undefined name `category`,
        # so this branch raised NameError instead of the intended message.
        raise Exception('Warning: {} not a valid category'.format(AItype))
def indico_batch_ner():
    """another ONE-OFF method to call the indico.io API to batch NER 18192 texts

    Reads file names from texts/filenames.txt, loads each via get_texts(),
    runs indico named_entities over every message body, and writes one CSV
    row per message to sentiments.csv (the result column is None when the
    API call failed).  Failures are recorded in indico_ner_errors.txt.
    """
    with open('sentiments.csv', 'wb') as f:
        writer = csv.writer(f)
        texts = []
        with open('texts/filenames.txt', 'r') as filenames:
            # One file name per line; prefix each with the texts directory.
            # (Original used two redundant map() passes over a list comp.)
            fn_list = ['texts/texts/' + line.strip() for line in filenames]
        for fn in fn_list:
            texts.append(get_texts(fn))  # returns TextMessage object
        # Flatten the per-file message lists into one flat list.
        texts = [item for sublist in texts for item in sublist]
        with open('indico_ner_errors.txt', 'w') as error_log:
            for text in texts:
                sentiment_result = None
                try:
                    sentiment_result = named_entities(text.body.encode(), api_key=INDICO_API_KEY)
                except Exception as e:
                    # Bug fix: was `except BaseException`, which also swallowed
                    # KeyboardInterrupt/SystemExit; add a newline so log
                    # entries don't run together on one line.
                    error_log.write(str(e) + '\n')
                finally:
                    # Row is written even on failure so every text gets a line.
                    writer.writerow([
                        unicode(s).encode('utf-8') for s in [
                            text.msg_id, text.posix, repr(text.sent),
                            text.body, repr(text.mentions), sentiment_result
                        ]
                    ])
def entity_extraction(self, text):
    """Run indico NER over *text* (threshold=0 keeps every candidate)
    and cache the result on the instance.
    """
    extracted = named_entities(text, threshold=0)
    self.named_entities = extracted
#fd = open("BBC.txt", "r")
#string1 += fd.read()
# Normalize UTF-8 smart punctuation (curly quotes/apostrophe) to ASCII.
for fancy, plain in (("\xe2\x80\x9c", "\""),
                     ("\xe2\x80\x9d", "\""),
                     ("\xe2\x80\x99", "\'")):
    string1 = string1.replace(fancy, plain)

keywordList = []
tagList = []
entityList = []
# maxsplit=size yields at most size+1 chunks; only the first `size` are used.
myList = string1.split("\n", size)
# One pass per chunk keeps the three indico API calls interleaved in the
# same order as the original (keywords, tags, entities per chunk).
for idx in range(size):
    chunk = myList[idx]
    keywordList.append(indicoio.keywords(chunk, top_n=10, independent=True))
    tagList.append(indicoio.text_tags(chunk, threshold=.05))
    entityList.append(indicoio.named_entities(chunk))
    #print indicoio.text_tags(myList[x], threshold=.1)
    #print indicoio.keywords(myList[x], top_n=6, independent=True)

## build 2-d array of pairwise similarity weights
matrix = [[0] * size for _ in range(size)]
for row in range(size):
    for col in range(size):
        matrix[row][col] = (1000
                            * compareKeywords(keywordList[row], keywordList[col])
                            * compareTags(tagList[row], tagList[col])
                            * compareEntities(entityList[row], entityList[col]))
        #print str(x) + " " + str(y) + " " + str(matrix[x][y])
def findNames(inputString):
    """Return the indico.io named-entity results for *inputString*."""
    detected = indicoio.named_entities(inputString)
    return detected
def fetch_named_entities(self):
    """Detect named entities in self.data and merge the five with the
    highest confidence into final_json['keywords'].
    """
    properNouns = indicoio.named_entities(self.data)
    print("\nproperNouns: ", properNouns)
    # Map each entity to its confidence score.
    confidence_by_entity = {}
    for entity, details in properNouns.items():
        confidence_by_entity[entity] = details['confidence']
    top_five = sorted(confidence_by_entity,
                      key=confidence_by_entity.get,
                      reverse=True)[:5]
    self.final_json['keywords'].update(top_five)
def get_entities():
    """POST handler: run indico NER over the submitted form text and
    return the result as JSON under the 'keywords' key.
    """
    if request.method != 'POST':
        # Non-POST requests fall through with no response body,
        # exactly as the original's un-taken `if` branch did.
        return None
    data = dict(request.form)['data_to_analyze']
    payload = {'keywords': indicoio.named_entities(data)}
    return json.dumps(payload)
def fetch_named_entities(self):
    """Run indico NER on self.data and fold the top five entities
    (ranked by confidence, descending) into final_json['keywords'].
    """
    properNouns = indicoio.named_entities(self.data)
    print("\nproperNouns: ", properNouns)
    scores = {name: info['confidence'] for name, info in properNouns.items()}
    ranked = sorted(scores, key=scores.get, reverse=True)
    self.final_json['keywords'].update(ranked[:5])