# Assumes the stanfordnlp package, as in the later examples in this section
from stanfordnlp.server import CoreNLPClient


class StanfordCoreferenceResolver(CoreferenceResolver):
    def __init__(self,
                 start_server=True,
                 endpoint=CoreNLPClient.DEFAULT_ENDPOINT):
        # Keep the client's default (protobuf) output format: the coreference
        # chains below are read via attribute access on the returned Document,
        # which would fail with output_format='json'.
        self.__client = CoreNLPClient(start_server=start_server,
                                      endpoint=endpoint,
                                      annotators=[
                                          'tokenize', 'ssplit', 'pos', 'lemma',
                                          'ner', 'parse', 'coref'
                                      ])
        self.__client.start()

    def __del__(self):
        self.__client.stop()

    def resolve_coreferences(self, text, entities):
        annotations = self.__client.annotate(text)

        # Convert every coreference chain into a list of (start, end) character
        # spans, one span per mention
        entity_mention_indices = []
        for chain in annotations.corefChain:
            mention_indices = []
            for mention in chain.mention:
                sentence = annotations.sentence[mention.sentenceIndex]
                token_start = sentence.token[mention.beginIndex]
                # endIndex is exclusive, so the last token sits at endIndex - 1
                token_end = sentence.token[mention.endIndex - 1]
                char_start = token_start.beginChar
                char_end = token_end.endChar
                mention_indices.append((char_start, char_end))
            entity_mention_indices.append(mention_indices)

        # Group the input entities by coreference chain; entities that fall
        # inside no mention end up in singleton sets of their own
        entity_sets = [list() for _ in range(len(entity_mention_indices))]
        for entity in entities:
            is_coreferred = False
            for i, mention_indices in enumerate(entity_mention_indices):
                for start_index, end_index in mention_indices:
                    if entity.start_offset >= start_index and entity.end_offset <= end_index:
                        entity_sets[i].append(entity)
                        is_coreferred = True
                        # Avoid appending the same entity to this set twice if
                        # it falls inside several mentions of one chain
                        break
            if not is_coreferred:
                entity_sets.append([entity])
        return entity_sets
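
# A minimal usage sketch for the resolver above. The Entity namedtuple is a
# hypothetical stand-in for whatever entity type the surrounding project uses
# (only start_offset/end_offset character attributes are needed), and a local
# CoreNLP installation must be available so the client can start a server.
from collections import namedtuple

Entity = namedtuple('Entity', ['text', 'start_offset', 'end_offset'])

resolver = StanfordCoreferenceResolver()
text = "Barack Obama was born in Hawaii. He was elected president in 2008."
entities = [Entity('Barack Obama', 0, 12), Entity('He', 33, 35)]

# Each returned set groups entities whose spans fall inside mentions of the
# same coreference chain; unmatched entities come back as singleton sets.
for entity_set in resolver.resolve_coreferences(text, entities):
    print([e.text for e in entity_set])
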
# Unpack the CoreNLP distribution (downloaded here as corenlp.zip) and give the
# folder a shorter name
!unzip corenlp.zip
!mv ./stanford-corenlp-full-2018-10-05 ./corenlp

# Set the CORENLP_HOME environment variable to point to the installation location
import os
os.environ["CORENLP_HOME"] = "./corenlp"

from stanfordnlp.server import CoreNLPClient

# Construct a CoreNLPClient with some basic annotators, a memory allocation of 4GB, and port number 9001
client = CoreNLPClient(annotators=['tokenize','ssplit', 'pos', 'lemma', 'ner'], memory='4G', endpoint='http://localhost:9001')
print(client)

# Start the background server and wait for some time
# Note that in practice this is totally optional, as by default the server will be started when the first annotation is performed
client.start()
import time; time.sleep(10)

# Check that the server's Java process is actually running
!ps -o pid,cmd | grep java

from google.colab import drive
drive.mount('/content/gdrive')

# Join wrapped lines with a space so that words at line breaks are not merged
with open('/content/gdrive/My Drive/Colab Notebooks/chapter1.txt', 'r') as file:
    data = file.read().replace('\n', ' ')

#data = "Such were some of various omens. Emperor Ling, greatly moved by these signs of the displeasure of Heaven, issued an edict asking his ministers for an explanation of the calamities and marvels."

document = client.annotate(data)
print(type(document))
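
# The annotate() call above returns a protobuf Document (the client's default
# output format), so sentences and tokens can be read via attribute access.
# A minimal sketch, assuming the annotation above completed successfully:
first_sentence = document.sentence[0]
for token in first_sentence.token:
    print(token.word, token.lemma, token.pos, token.ner)
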
Example #3
import os
from pathlib import Path
from typing import Dict, Optional

from overrides import overrides
# Protobuf helpers and the client are assumed to come from the stanfordnlp
# package, as in the earlier examples; ComponentBase and get_dict_hash are
# provided by the surrounding project.
from stanfordnlp.protobuf import Document, parseFromDelimitedString, \
    writeToDelimitedString
from stanfordnlp.server import CoreNLPClient


class CoreNlp(ComponentBase):
    def __init__(self, config, config_global, logger):
        super(CoreNlp, self).__init__(config, config_global, logger)

        self.cache = self._provide_cache("stanfordnlp_cache",
                                         human_readable=False)

        corenlp_home = config.get("corenlp_home", None)
        if corenlp_home:
            # resolve corenlp_home against the shell's working dir
            os.environ["CORENLP_HOME"] = str(Path.cwd() / Path(corenlp_home))

        self._kwargs = config.pop("corenlp_kwargs", {"annotators": "depparse"})
        self._client = None  # type: Optional[CoreNLPClient]

    def parse_sentence(self, sentence: str, properties: Optional[Dict] = None):
        """
        Run CoreNLP over a sentence.
        :param sentence: a single sentence
        :param properties: additional properties for CoreNLP
        :return: parsing result
        """
        # The same input sentence can result in different annotations depending on the CoreNLP properties specified.
        # We therefore use a cache identifier for the sentence which includes the annotation properties.
        sent_cache_identifier = get_dict_hash(
            {
                "sentence": sentence,
                "properties": properties
            }, shorten=False)

        if sent_cache_identifier not in self.cache:
            # Kludge ahead: We want to cache the parsed sentence provided by CoreNLP, but also want to work with it in
            # a convenient format. A convenient format is the default format (protobuf-based), but that's not
            # pickle-able for the cache. We therefore convert the protobuf-format back into a bytestring and cache that.
            # When reading from the cache, we reassemble the protobuf object.
            req_properties = {"outputFormat": "serialized"}
            if properties is not None:
                req_properties.update(properties)
            doc = self.client.annotate(sentence, properties=req_properties)
            stream = writeToDelimitedString(doc)
            buf = stream.getvalue()
            stream.close()
            self.cache[sent_cache_identifier] = buf
        else:
            buf = self.cache[sent_cache_identifier]
            doc = Document()
            parseFromDelimitedString(doc, buf)

        return doc

    @property
    def client(self):
        if self._client is None:
            self._client = CoreNLPClient(**self._kwargs)
            self._client.start()
        return self._client

    @overrides
    def clean_up(self):
        if self._client is not None:
            self._client.stop()
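
# Minimal standalone sketch of the caching round-trip used in parse_sentence():
# the protobuf Document is serialized to a byte string (which can be stored in
# the cache) and reassembled later. Package names follow the stanfordnlp
# examples above, and a local CoreNLP installation is assumed.
from stanfordnlp.protobuf import Document, parseFromDelimitedString, writeToDelimitedString
from stanfordnlp.server import CoreNLPClient

with CoreNLPClient(annotators=['tokenize', 'ssplit', 'pos', 'depparse']) as client:
    doc = client.annotate('The quick brown fox jumps over the lazy dog.')

stream = writeToDelimitedString(doc)   # Document -> in-memory byte stream
buf = stream.getvalue()                # raw bytes, safe to put in a cache
stream.close()

restored = Document()
parseFromDelimitedString(restored, buf)  # bytes -> protobuf Document again
print(restored.sentence[0].token[0].word)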