예제 #1
0
    def extract_entity_frequencies(self, tokens):
        """
        Tally named-entity mentions found in the given tokenized sentences
        and record per-type frequency counts on ``self.entity_frequencies``.

        Args:
            tokens -- list of tokenized sentences (each a list of word
                strings) to POS-tag and chunk for named entities.
        """
        tagged_sents = [nltk.pos_tag(sent) for sent in tokens]
        location_chunker = LocationChunker()

        ne_trees = batch_ne_chunk(tagged_sents)
        for entity_type in ['PERSON', 'ORGANIZATION', 'GPE', 'LOCATION', 'FACILITY']:
            counts = {}
            for tree in ne_trees:
                for chunk in sub_leaves(tree, entity_type):
                    # Each leaf is a (word, tag) pair; join words into one name.
                    name = ' '.join(leaf[0] for leaf in chunk)
                    counts[name] = counts.get(name, 0) + 1

            # A secondary attempt at extracting locations based on reference
            # to lists of place names.
            if entity_type == 'LOCATION':
                for tagged_sent in tagged_sents:
                    parsed = location_chunker.parse(tagged_sent)
                    for chunk in sub_leaves(parsed, 'LOCATION'):
                        name = ' '.join(leaf[0] for leaf in chunk)
                        counts[name] = counts.get(name, 0) + 1

            # Highest counts first; ties break reverse-alphabetically on name.
            ranked = sorted(((freq, name) for name, freq in counts.items()),
                            reverse=True)
            for freq, name in ranked:
                self.entity_frequencies.append(
                    entity_type=entity_type.lower(),
                    entity=name,
                    frequency=freq)
예제 #2
0
파일: __init__.py 프로젝트: keho98/argos
def entities(docs, strategy='stanford'):
    """
    Named entity recognition on
    a text document or documents.

    Requires that a Stanford NER server is
    running on localhost:8080.

    Args:
        | docs (list)       -- the documents to process.
        | doc (str)         -- the document to process.
        | strategy (str)    -- the strategy to use, default is stanford.

    Returns:
        | list              -- list of all entity mentions

    Raises:
        | Exception         -- if an unknown strategy is specified.
    """
    # Accept a single document string for convenience.
    if isinstance(docs, str):
        docs = [docs]

    entities = []

    if strategy == 'stanford':
        tagger = ner.SocketNER(host='localhost', port=8080)

        for doc in docs:
            # get_entities returns a {tag: [names]} mapping; we're only
            # interested in the entity names, not their tags.
            ents = tagger.get_entities(doc)
            names = [ents[key] for key in ents]

            # Flatten the list of lists.
            names = [name for sublist in names for name in sublist]

            entities += names

        # NOTE: a naive normalized-weighting step (Counter over `entities`,
        # dividing each count by the top count) was removed here — that part
        # is handled externally by a vectorizer. Recognizing congruent
        # entities ("Bill Clinton" vs "Clinton") is not yet implemented.
        return entities

    elif strategy == 'nltk':
        from nltk.tag import pos_tag
        from nltk.chunk import batch_ne_chunk
        for doc in docs:
            sentences = sent_tokenize(doc)
            tokenized_sentences = [word_tokenize(sent) for sent in sentences]
            tagged = [pos_tag(sent) for sent in tokenized_sentences]
            # binary=False would tag entities as ORGANIZATION, etc.
            chunked = batch_ne_chunk(tagged, binary=True)

            for tree in chunked:
                entities.extend(_extract_entities(tree))
        # Bug fix: this return was previously inside the loop above, so
        # only the first document was ever processed.
        return entities

    else:
        raise Exception('Unknown strategy specified.')
예제 #3
0
파일: __init__.py 프로젝트: keho98/argos
def entities(docs, strategy='stanford'):
    """
    Named entity recognition on
    a text document or documents.

    Requires that a Stanford NER server is
    running on localhost:8080.

    Args:
        | docs (list)       -- the documents to process.
        | doc (str)         -- the document to process.
        | strategy (str)    -- the strategy to use, default is stanford.

    Returns:
        | list              -- list of all entity mentions

    Raises:
        | Exception         -- if an unknown strategy is specified.
    """
    # Accept a single document string for convenience.
    if isinstance(docs, str):
        docs = [docs]

    entities = []

    if strategy == 'stanford':
        tagger = ner.SocketNER(host='localhost', port=8080)

        for doc in docs:
            # get_entities returns a {tag: [names]} mapping; we're only
            # interested in the entity names, not their tags.
            ents = tagger.get_entities(doc)
            names = [ents[key] for key in ents]

            # Flatten the list of lists.
            names = [name for sublist in names for name in sublist]

            entities += names

        # NOTE: a naive normalized-weighting step (Counter over `entities`,
        # dividing each count by the top count) was removed here — that part
        # is handled externally by a vectorizer. Recognizing congruent
        # entities ("Bill Clinton" vs "Clinton") is not yet implemented.
        return entities

    elif strategy == 'nltk':
        from nltk.tag import pos_tag
        from nltk.chunk import batch_ne_chunk
        for doc in docs:
            sentences = sent_tokenize(doc)
            tokenized_sentences = [word_tokenize(sent) for sent in sentences]
            tagged = [pos_tag(sent) for sent in tokenized_sentences]
            # binary=False would tag entities as ORGANIZATION, etc.
            chunked = batch_ne_chunk(tagged, binary=True)

            for tree in chunked:
                entities.extend(_extract_entities(tree))
        # Bug fix: this return was previously inside the loop above, so
        # only the first document was ever processed.
        return entities

    else:
        raise Exception('Unknown strategy specified.')