def do_POST(self):
    """Serve one POST to /annotate: read a delimited protobuf Document,
    annotate it, and write the annotated Document back."""
    # Treat "/annotate" and "/annotate/" the same.
    if not self.path.endswith("/"):
        self.path += "/"
    if self.path != "/annotate/":
        self.send_response(HTTPStatus.BAD_REQUEST)
        self.end_headers()
        return
    # Pull the serialized document off the wire.
    body_size = int(self.headers.get('content-length'))
    payload = self.rfile.read(body_size)
    # Deserialize, annotate, and re-serialize.
    doc = Document()
    parseFromDelimitedString(doc, payload)
    self.annotator.annotate(doc)
    with io.BytesIO() as out:
        writeToDelimitedString(doc, out)
        payload = out.getvalue()
    # Ship the annotated document back as protobuf.
    self.send_response(HTTPStatus.OK)
    self.send_header("Content-Type", "application/x-protobuf")
    self.send_header("Content-Length", len(payload))
    self.end_headers()
    self.wfile.write(payload)
def parse_sentence(self, sentence: str, properties: Optional[Dict] = None):
    """Run CoreNLP over a sentence.

    :param sentence: a single sentence
    :param properties: additional properties for CoreNLP
    :return: parsing result
    """
    # Identical sentences can be annotated differently under different
    # CoreNLP properties, so the cache key covers both.
    cache_key = get_dict_hash(
        {
            "sentence": sentence,
            "properties": properties
        }, shorten=False)
    if cache_key in self.cache:
        buf = self.cache[cache_key]
    else:
        # Kludge: the protobuf Document is the convenient working format but
        # is not pickle-able for the cache, so we cache its delimited
        # bytestring form and rebuild the protobuf object on every read.
        req_properties = {"outputFormat": "serialized"}
        if properties is not None:
            req_properties.update(properties)
        parsed = self.client.annotate(sentence, properties=req_properties)
        stream = writeToDelimitedString(parsed)
        buf = stream.getvalue()
        stream.close()
        self.cache[cache_key] = buf
    doc = Document()
    parseFromDelimitedString(doc, buf)
    return doc
def test_write_protobuf(doc_pb):
    """Round-trip a Document through the delimited-string serializer and
    check the result equals the original."""
    stream = writeToDelimitedString(doc_pb)
    serialized = stream.getvalue()
    stream.close()
    roundtripped = Document()
    parseFromDelimitedString(roundtripped, serialized)
    assert doc_pb == roundtripped
def doc_pb():
    """Load the test Document fixture from data/test.dat next to this file."""
    data_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'data', 'test.dat')
    with open(data_path, 'rb') as f:
        raw = f.read()
    doc = Document()
    parseFromDelimitedString(doc, raw)
    return doc
def annotate(self, text, annotators=None, output_format=None, properties=None):
    """Send a request to the CoreNLP server.

    :param (str | unicode) text: raw text for the CoreNLPServer to parse
    :param (list | string) annotators: list of annotators to use
    :param (str) output_format: output type from server: serialized, json,
        text, conll, conllu, or xml
    :param (dict) properties: properties that the server expects
    :return: request result (Document, dict, str, or the raw response,
        depending on outputFormat)
    """
    # set properties for server call
    if properties is None:
        # BUGFIX: work on a copy — the original updated
        # self.default_properties in place, so per-call annotators and
        # output settings leaked into every subsequent request.
        properties = dict(self.default_properties)
        properties.update({
            'annotators': ','.join(annotators or self.default_annotators),
            'inputFormat': 'text',
            'outputFormat': self.default_output_format,
            'serializer': 'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer'
        })
    else:
        # Copy so the caller's dict is never mutated as a side effect.
        properties = dict(properties)
        if "annotators" not in properties:
            properties['annotators'] = ','.join(
                annotators or self.default_annotators)
    # if an output_format is specified, use that to override
    if output_format is not None:
        properties["outputFormat"] = output_format
    # make the request
    r = self._request(text.encode('utf-8'), properties)
    # customize what is returned based on outputFormat
    if properties["outputFormat"] == "serialized":
        doc = Document()
        parseFromDelimitedString(doc, r.content)
        return doc
    elif properties["outputFormat"] == "json":
        return r.json()
    elif properties["outputFormat"] in ["text", "conllu", "conll", "xml"]:
        return r.text
    else:
        return r
def update(self, doc, annotators=None, properties=None):
    """Send a serialized Document back to the server for further annotation.

    :param doc: a Document protobuf object to re-annotate
    :param (list | string) annotators: list of annotators to use
    :param (dict) properties: properties that the server expects
    :return: the re-annotated Document
    """
    if properties is None:
        # BUGFIX: copy the defaults — the original called .update() on
        # self.default_properties itself, permanently switching the shared
        # defaults to serialized input/output for all later requests.
        properties = dict(self.default_properties)
        properties.update({
            'annotators': ','.join(annotators or self.default_annotators),
            'inputFormat': 'serialized',
            'outputFormat': 'serialized',
            'serializer': 'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer'
        })
    # Serialize the document for the request body.
    with io.BytesIO() as stream:
        writeToDelimitedString(doc, stream)
        msg = stream.getvalue()
    r = self._request(msg, properties)
    # Parse the server's response back into a fresh Document.
    doc = Document()
    parseFromDelimitedString(doc, r.content)
    return doc
def update(self, doc, annotators=None, properties=None):
    """Send a serialized Document back to the server for further annotation.

    :param doc: a Document protobuf object to re-annotate
    :param (list | string) annotators: annotators to run (list or
        comma-separated string)
    :param (dict) properties: additional request properties
    :return: the re-annotated Document
    """
    # BUGFIX: copy (or create) the dict so the caller's `properties`
    # argument is not mutated as a side effect of this call.
    properties = {} if properties is None else dict(properties)
    # Protobuf in, protobuf out.
    properties.update({
        'inputFormat': 'serialized',
        'outputFormat': 'serialized',
        'serializer': 'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer'
    })
    if annotators:
        properties['annotators'] = ",".join(annotators) if isinstance(
            annotators, list) else annotators
    # Serialize the document for the request body.
    with io.BytesIO() as stream:
        writeToDelimitedString(doc, stream)
        msg = stream.getvalue()
    r = self._request(msg, properties)
    # Parse the server's response back into a fresh Document.
    doc = Document()
    parseFromDelimitedString(doc, r.content)
    return doc
def annotate(self, text, annotators=None, output_format=None, properties_key=None, properties=None, **kwargs):
    """Send a request to the CoreNLP server.

    :param (str | unicode) text: raw text for the CoreNLPServer to parse
    :param (list | string) annotators: list of annotators to use
    :param (str) output_format: output type from server: serialized, json,
        text, conll, conllu, or xml
    :param (str) properties_key: key into properties cache for the client
    :param (dict) properties: additional request properties (written on top
        of defaults)

    The properties for a request are layered in this order:

    1. Server default properties (server side)
    2. Properties from the client's properties_cache for properties_key
       (client side). If properties_key names a Stanford CoreNLP supported
       language [Arabic, Chinese, English, French, German, Spanish], the
       Stanford CoreNLP defaults for that language are used (server side)
    3. Additional properties from the `properties` argument (client side)
    4. Special-case overrides: annotators, output_format (client side)

    :return: request result
    """
    # Layer 1/2: start from the cached property set, or just select a
    # pipeline language when a supported language name is given.
    if properties_key is None:
        request_properties = {}
    elif properties_key.lower() in CoreNLPClient.PIPELINE_LANGUAGES:
        request_properties = {'pipelineLanguage': properties_key.lower()}
    else:
        request_properties = self.properties_cache.get(properties_key, {})
    # Layer 3: per-call custom properties.
    if properties is None:
        properties = {}
    request_properties.update(properties)
    # Layer 4: explicit annotators override whatever was layered so far.
    if annotators is not None:
        request_properties['annotators'] = ",".join(annotators) if isinstance(
            annotators, list) else annotators
    # Always send an output format: the server's default may be unknown, so
    # fall back to the start-up props, then the client-wide default.
    if output_format is not None:
        request_properties['outputFormat'] = output_format
    if request_properties.get('outputFormat') is None:
        startup_format = self.server_start_info.get('props',
                                                    {}).get('outputFormat')
        request_properties['outputFormat'] = (
            startup_format or CoreNLPClient.DEFAULT_OUTPUT_FORMAT)
    # Make the request and decode the response per the chosen format.
    r = self._request(text.encode('utf-8'), request_properties, **kwargs)
    fmt = request_properties["outputFormat"]
    if fmt == "json":
        return r.json()
    if fmt == "serialized":
        doc = Document()
        parseFromDelimitedString(doc, r.content)
        return doc
    if fmt in ("text", "conllu", "conll", "xml"):
        return r.text
    return r