import nltk
from nltk.chunk import batch_ne_chunk  # NLTK 2.x API; NLTK 3 renamed this to nltk.ne_chunk_sents


def extract_entity_frequencies(self, tokens):
    # `tokens` is a list of tokenized sentences.
    pos_tagged_tokens = [nltk.pos_tag(t) for t in tokens]
    loc = LocationChunker()
    trees = batch_ne_chunk(pos_tagged_tokens)

    entity_types = ['PERSON', 'ORGANIZATION', 'GPE', 'LOCATION', 'FACILITY']
    for entity_type in entity_types:
        entity_freq_dict = {}
        chunks = [sub_leaves(t, entity_type) for t in trees]
        for sent in chunks:
            for c in sent:
                entity = ' '.join([w[0] for w in c])
                entity_freq_dict[entity] = entity_freq_dict.get(entity, 0) + 1

        # A secondary attempt at extracting locations, based on
        # lists of known place names.
        if entity_type == 'LOCATION':
            for sent in pos_tagged_tokens:
                t = loc.parse(sent)
                chunks = sub_leaves(t, 'LOCATION')
                for c in chunks:
                    entity = ' '.join([w[0] for w in c])
                    entity_freq_dict[entity] = entity_freq_dict.get(entity, 0) + 1

        # Sort the entities by descending frequency.
        entity_freq_list = [(freq, entity) for entity, freq in entity_freq_dict.items()]
        entity_freq_list.sort(reverse=True)

        for freq, entity in entity_freq_list:
            # entity_frequencies presumably accepts keyword arguments
            # (e.g. an ORM-backed collection rather than a plain list).
            self.entity_frequencies.append(
                    entity_type=entity_type.lower(),
                    entity=entity,
                    frequency=freq)
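# Note: sub_leaves() and LocationChunker are not part of NLTK itself; both
# are assumed to be defined elsewhere in the codebase (LocationChunker is
# presumably a gazetteer-style chunker over lists of place names). A minimal
# sketch of what sub_leaves() likely does, assuming NLTK 2.x trees (where
# the label is exposed as Tree.node):

def sub_leaves(tree, node):
    """Return the leaves of every subtree labeled `node`, e.g. 'PERSON'."""
    return [t.leaves() for t in tree.subtrees(lambda s: s.node == node)]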
import ner  # PyNER client for the Stanford NER server
from nltk.tokenize import sent_tokenize, word_tokenize


def entities(docs, strategy='stanford'):
    """
    Named entity recognition on a text document or documents.

    The 'stanford' strategy requires that a Stanford NER server
    is running on localhost:8080.

    Args:
        | docs (list|str) -- the document or documents to process.
        | strategy (str) -- the strategy to use, default is 'stanford'.

    Returns:
        | list -- list of all entity mentions
    """
    if type(docs) is str:
        docs = [docs]

    entities = []

    if strategy == 'stanford':
        tagger = ner.SocketNER(host='localhost', port=8080)

        for doc in docs:
            ents = tagger.get_entities(doc)

            # We're only interested in the entity names,
            # not their tags.
            names = [ents[key] for key in ents]

            # Flatten the list of lists.
            names = [name for sublist in names for name in sublist]

            entities += names

        # TEMPORARILY REMOVED, THIS PART IS HANDLED EXTERNALLY BY A VECTORIZER.
        # Calculate (rough, naive) normalized weights for the entities.
        # Will likely want to find ways to recognize congruent entities which
        # may not necessarily be consistently mentioned, e.g. "Bill Clinton"
        # and "Clinton" (not yet implemented).
        #counts = Counter(entities)
        #if len(counts):
        #    top_count = counts.most_common(1)[0][1]
        #    results = []
        #    for entity, count in counts.items():
        #        results.append((entity, count/top_count))
        #    return results

        return entities

    elif strategy == 'nltk':
        from nltk.tag import pos_tag
        from nltk.chunk import batch_ne_chunk  # NLTK 2.x API

        for doc in docs:
            sentences = sent_tokenize(doc)
            tokenized_sentences = [word_tokenize(sent) for sent in sentences]
            tagged = [pos_tag(sent) for sent in tokenized_sentences]

            # binary=False would instead tag entities as PERSON,
            # ORGANIZATION, etc.
            chunked = batch_ne_chunk(tagged, binary=True)

            for tree in chunked:
                entities.extend(_extract_entities(tree))

        return entities

    else:
        raise Exception('Unknown strategy specified.')
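# _extract_entities() is called by the 'nltk' strategy above but is not
# defined in this module. A minimal sketch, under the assumption that with
# binary=True the chunker labels entity subtrees 'NE' (NLTK 2.x trees,
# label exposed as Tree.node):

def _extract_entities(tree):
    entities = []
    if hasattr(tree, 'node') and tree.node:
        if tree.node == 'NE':
            # Leaves are (word, POS tag) tuples; join the words into
            # a single entity mention.
            entities.append(' '.join(word for word, tag in tree.leaves()))
        else:
            # Recurse into child subtrees, skipping plain token tuples.
            for child in tree:
                if hasattr(child, 'node'):
                    entities.extend(_extract_entities(child))
    return entities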
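# Example usage (hypothetical): the 'nltk' strategy needs no external
# server but does require the NLTK tokenizer/tagger/chunker data to be
# downloaded; 'stanford' assumes a Stanford NER server on localhost:8080.
if __name__ == '__main__':
    docs = [
        "Bill Clinton gave a speech in New York.",
        "Google has opened a new office in Toronto.",
    ]
    print(entities(docs, strategy='nltk'))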