class StanfordCoreferenceResolver(CoreferenceResolver):
    """Coreference resolver backed by a Stanford CoreNLP server.

    Groups the given entities into sets according to CoreNLP's coreference
    chains: entities covered by mentions of the same chain end up in the
    same set, and entities not covered by any chain each get a singleton set.
    """

    def __init__(self, start_server=True, endpoint=CoreNLPClient.DEFAULT_ENDPOINT):
        # BUGFIX: the original passed output_format='json', which makes
        # annotate() return a plain dict — but resolve_coreferences() reads
        # protobuf attributes (annotations.corefChain, .sentence, .token).
        # Dropping output_format keeps the client's default serialized
        # (protobuf) format, which is what the code below actually consumes.
        self.__client = CoreNLPClient(
            start_server=start_server,
            endpoint=endpoint,
            annotators=[
                'tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'parse', 'coref'
            ])
        self.__client.start()

    def __del__(self):
        # Guard: if __init__ raised before __client was assigned, there is
        # nothing to stop and __del__ must not raise a secondary error.
        try:
            self.__client.stop()
        except AttributeError:
            pass

    def resolve_coreferences(self, text, entities):
        """Partition `entities` into coreference sets found in `text`.

        :param text: the raw document text to annotate
        :param entities: objects with `start_offset`/`end_offset` character
            offsets into `text`
        :return: a list of lists of entities; one list per coreference chain
            (possibly empty if no entity fell inside that chain's mentions),
            plus a singleton list for every entity not covered by any chain.
            An entity that falls inside mentions of several chains appears in
            each of those chains' sets.
        """
        annotations = self.__client.annotate(text)

        # Collect the (char_start, char_end) span of every mention,
        # grouped by coreference chain.
        entity_mention_indices = []
        for chain in annotations.corefChain:
            mention_indices = []
            for mention in chain.mention:
                sentence = annotations.sentence[mention.sentenceIndex]
                token_start = sentence.token[mention.beginIndex]
                # endIndex is exclusive, so the last token is at endIndex - 1.
                token_end = sentence.token[mention.endIndex - 1]
                mention_indices.append((token_start.beginChar, token_end.endChar))
            entity_mention_indices.append(mention_indices)

        # One (initially empty) entity set per chain.
        entity_sets = [list() for _ in range(len(entity_mention_indices))]
        for entity in entities:
            is_coreferred = False
            for i, mention_indices in enumerate(entity_mention_indices):
                for start_index, end_index in mention_indices:
                    # An entity belongs to a chain when it lies fully inside
                    # one of the chain's mention spans.
                    if entity.start_offset >= start_index and entity.end_offset <= end_index:
                        entity_sets[i].append(entity)
                        is_coreferred = True
            if not is_coreferred:
                # Entities outside every chain become their own set.
                entity_sets.append([entity])
        return entity_sets
# Colab/Jupyter notebook cell: install CoreNLP, start a server, and annotate
# a text file from Google Drive.  The `!`-prefixed lines are shell magics and
# only work inside a notebook, not as plain Python.

# Extract the CoreNLP distribution and rename it to a stable directory name.
!unzip corenlp.zip
!mv ./stanford-corenlp-full-2018-10-05 ./corenlp

# Set the CORENLP_HOME environment variable to point to the installation location
import os
os.environ["CORENLP_HOME"] = "./corenlp"

from stanfordnlp.server import CoreNLPClient

# Construct a CoreNLPClient with some basic annotators, a memory allocation of 4GB, and port number 9001
client = CoreNLPClient(annotators=['tokenize','ssplit', 'pos', 'lemma', 'ner'], memory='4G', endpoint='http://localhost:9001')
print(client)

# Start the background server and wait for some time
# Note that in practice this is totally optional, as by default the server will be started when the first annotation is performed
client.start()
import time; time.sleep(10)

# Sanity check: list the running Java processes to confirm the server is up.
!ps -o pid,cmd | grep java

# Mount Google Drive so the input text file is readable.
from google.colab import drive
drive.mount('/content/gdrive')

# Read the whole chapter as a single line (newlines stripped).
with open('/content/gdrive/My Drive/Colab Notebooks/chapter1.txt', 'r') as file:
    data = file.read().replace('\n', '')
#data = "Such were some of various omens. Emperor Ling, greatly moved by these signs of the displeasure of Heaven, issued an edict asking his ministers for an explanation of the calamities and marvels."

# Annotate and show the result type (a protobuf Document by default).
document = client.annotate(data)
print(type(document))
class CoreNlp(ComponentBase):
    """Component wrapping a Stanford CoreNLP client with a persistent
    per-sentence annotation cache.

    The CoreNLP server client is created lazily on first use (see `client`)
    and shut down in `clean_up`.
    """

    def __init__(self, config, config_global, logger):
        super(CoreNlp, self).__init__(config, config_global, logger)
        # Maps a hash of (sentence, properties) to the serialized protobuf
        # bytes of the corresponding CoreNLP annotation.
        self.cache = self._provide_cache("stanfordnlp_cache", human_readable=False)

        corenlp_home = config.get("corenlp_home", None)
        if corenlp_home:
            # resolve corenlp_home against the shell's working dir
            os.environ["CORENLP_HOME"] = str(Path.cwd() / Path(corenlp_home))

        # NOTE(review): pop() mutates the caller's config dict — kept as-is
        # for backward compatibility, but worth confirming callers expect it.
        self._kwargs = config.pop("corenlp_kwargs", {"annotators": "depparse"})
        self._client = None  # type: Optional[CoreNLPClient]

    def parse_sentence(self, sentence: str, properties: Optional[Dict] = None):
        """
        Run CoreNLP over a sentence.
        :param sentence: a single sentence
        :param properties: additional properties for CoreNLP
        :return: parsing result (a CoreNLP protobuf Document)
        """
        # The same input sentence can result in different annotations depending
        # on the CoreNLP properties specified.  We therefore use a cache
        # identifier for the sentence which includes the annotation properties.
        sent_cache_identifier = get_dict_hash(
            {
                "sentence": sentence,
                "properties": properties
            }, shorten=False)

        # Idiom fix: `x not in y` instead of `not x in y`.
        if sent_cache_identifier not in self.cache:
            # Kludge ahead: We want to cache the parsed sentence provided by
            # CoreNLP, but also want to work with it in a convenient format.
            # A convenient format is the default format (protobuf-based), but
            # that's not pickle-able for the cache.  We therefore convert the
            # protobuf-format back into a bytestring and cache that.  When
            # reading from the cache, we reassemble the protobuf object.
            req_properties = {"outputFormat": "serialized"}
            if properties is not None:
                req_properties.update(properties)
            doc = self.client.annotate(sentence, properties=req_properties)
            stream = writeToDelimitedString(doc)
            buf = stream.getvalue()
            stream.close()
            self.cache[sent_cache_identifier] = buf
        else:
            buf = self.cache[sent_cache_identifier]

        # Reassemble the protobuf Document from the cached bytestring so both
        # the cache-hit and cache-miss paths return the same kind of object.
        doc = Document()
        parseFromDelimitedString(doc, buf)
        return doc

    @property
    def client(self):
        # Lazily construct and start the CoreNLP client on first access.
        if self._client is None:
            self._client = CoreNLPClient(**self._kwargs)
            self._client.start()
        return self._client

    @overrides
    def clean_up(self):
        # Stop the background CoreNLP server only if it was ever started.
        if self._client is not None:
            self._client.stop()