示例#1
0
def test_write_protobuf(doc_pb):
    """Check that a document survives a write/parse round trip unchanged."""
    # Serialize the document and keep the bytes before releasing the stream.
    out = writeToDelimitedString(doc_pb)
    serialized = out.getvalue()
    out.close()

    # Parse the bytes back into a fresh message and compare with the input.
    restored = Document()
    parseFromDelimitedString(restored, serialized)
    assert doc_pb == restored
示例#2
0
def doc_pb():
    """Return a Document parsed from ``data/test.dat`` located beside this file."""
    here = os.path.abspath(__file__)
    data_path = os.path.join(os.path.dirname(here), 'data', 'test.dat')

    # The data file is read as raw bytes and handed to the delimited parser.
    with open(data_path, 'rb') as handle:
        raw = handle.read()

    parsed = Document()
    parseFromDelimitedString(parsed, raw)
    return parsed
示例#3
0
    def update(self, doc, annotators=None, properties=None):
        """
        Send an existing document to the server and return the document parsed
        from the response.

        The document is written as a delimited protobuf message and the
        response body is parsed the same way, so the serialized input/output
        format and serializer settings are filled in whenever the caller's
        properties do not already provide them.

        :param doc: document message to serialize and send
        :param (list | str) annotators: annotators to run; a list is joined with commas
        :param (dict) properties: additional request properties; values given here
            take precedence over the serialization defaults, and the dict itself
            is not modified

        :return: a new Document parsed from the response content
        """
        # Start from the settings this method needs, then layer the caller's
        # properties on top in a fresh dict so the argument is never mutated.
        request_properties = {
            'inputFormat': 'serialized',
            'outputFormat': 'serialized',
            'serializer': 'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer',
        }
        if properties is not None:
            request_properties.update(properties)

        if annotators:
            request_properties['annotators'] = (
                annotators if isinstance(annotators, str) else ",".join(annotators)
            )

        with io.BytesIO() as stream:
            writeToDelimitedString(doc, stream)
            msg = stream.getvalue()

        r = self._request(msg, request_properties)
        doc = Document()
        parseFromDelimitedString(doc, r.content)
        return doc
示例#4
0
    def annotate(self, text, annotators=None, output_format=None, properties=None, reset_default=None, **kwargs):
        """
        Send a request to the CoreNLP server.

        :param (str | unicode) text: raw text for the CoreNLPServer to parse
        :param (list | string) annotators: list of annotators to use
        :param (str) output_format: output type from server: serialized, json, text, conll, conllu, or xml
        :param (dict | str) properties: additional request properties (written on top of defaults);
            a string is accepted only when it names a CoreNLP language, in which case it is sent
            as the 'pipelineLanguage' property
        :param (bool) reset_default: don't use server defaults; when left as None it becomes True
            for a language-name ``properties`` string and False otherwise

        Precedence for settings:

        1. annotators and output_format args
        2. Values from properties dict
        3. Client defaults self.annotators and self.output_format (set during client construction)
        4. Server defaults

        Additional request parameters (apart from CoreNLP pipeline properties) such as 'username' and 'password'
        can be specified with the kwargs.

        :raises ValueError: if ``properties`` is a string that is not a recognized language

        :return: parsed JSON for "json", a Document for "serialized", the response text for
            "text", "conllu", "conll" and "xml", and the raw response object when the output
            format is anything else or was not set on the client side
        """

        # validate request properties
        validate_corenlp_props(properties=properties, annotators=annotators, output_format=output_format)

        # start with client defaults
        request_properties = {}
        if self.annotators is not None:
            request_properties['annotators'] = self.annotators
        if self.output_format is not None:
            request_properties['outputFormat'] = self.output_format

        # a string for properties is shorthand for a pipeline language
        if isinstance(properties, str):
            if not is_corenlp_lang(properties):
                raise ValueError(f"Unrecognized properties keyword {properties}")
            properties = {'pipelineLanguage': properties.lower()}
            if reset_default is None:
                reset_default = True

        # add values from properties arg
        if isinstance(properties, dict):
            request_properties.update(properties)

        # explicit annotators override both the properties and the client default
        if isinstance(annotators, str):
            request_properties['annotators'] = annotators
        elif isinstance(annotators, list):
            request_properties['annotators'] = ",".join(annotators)

        # explicit output format overrides both the properties and the client default
        if isinstance(output_format, str):
            request_properties['outputFormat'] = output_format

        # make the request
        # if not explicitly set and not the pipelineLanguage case, reset_default is still None here
        if reset_default is None:
            reset_default = False
        r = self._request(text.encode('utf-8'), request_properties, reset_default, **kwargs)

        # 'outputFormat' is absent when the caller relies on the server default,
        # so look it up without assuming the key exists.
        response_format = request_properties.get("outputFormat")
        if response_format == "json":
            return r.json()
        if response_format == "serialized":
            doc = Document()
            parseFromDelimitedString(doc, r.content)
            return doc
        if response_format in ("text", "conllu", "conll", "xml"):
            return r.text
        return r
    def annotate(self, text, annotators=None, output_format=None, properties_key=None, properties=None, **kwargs):
        """
        Annotate ``text`` by sending it to the CoreNLP server.

        :param (str | unicode) text: raw text for the CoreNLPServer to parse
        :param (list | string) annotators: annotators to use; a list is joined with commas
        :param (str) output_format: output type from server: serialized, json, text, conll, conllu, or xml
        :param (str) properties_key: key into properties cache for the client
        :param (dict) properties: additional request properties (written on top of defaults)

        The request properties are layered, later layers overriding earlier ones:

        1. A base set selected by properties_key:
           'en' / 'english' (case-insensitive) copies ENGLISH_DEFAULT_REQUEST_PROPERTIES;
           a name found in CoreNLPClient.PIPELINE_LANGUAGES sends only 'pipelineLanguage';
           any other key copies the matching entry of the client's properties_cache
           (empty when the key is unknown or no key is given)
        2. The entries of properties
        3. The annotators and output_format arguments

        An 'outputFormat' is always sent: when none was chosen above, the one recorded
        in self.server_start_info['props'] is used, otherwise
        CoreNLPClient.DEFAULT_OUTPUT_FORMAT.

        :return: parsed JSON for "json", a Document for "serialized", the response text for
            "text", "conllu", "conll" and "xml", and the raw response object otherwise
        """
        # layer 1: base properties chosen by properties_key
        request_properties = {}
        if properties_key is not None:
            lowered_key = properties_key.lower()
            if lowered_key in ['en', 'english']:
                request_properties = dict(ENGLISH_DEFAULT_REQUEST_PROPERTIES)
            elif lowered_key in CoreNLPClient.PIPELINE_LANGUAGES:
                request_properties = {'pipelineLanguage': lowered_key}
            else:
                # the cache is looked up with the key exactly as given
                request_properties = dict(self.properties_cache.get(properties_key, {}))

        # layer 2: per-request custom properties
        request_properties.update({} if properties is None else properties)

        # layer 3: explicit arguments
        if annotators is not None:
            if isinstance(annotators, list):
                annotators = ",".join(annotators)
            request_properties['annotators'] = annotators
        if output_format is not None:
            request_properties['outputFormat'] = output_format

        # the server's default output format may be unknown, so always send one
        if request_properties.get('outputFormat') is None:
            server_format = self.server_start_info.get('props', {}).get('outputFormat')
            request_properties['outputFormat'] = (
                server_format if server_format else CoreNLPClient.DEFAULT_OUTPUT_FORMAT
            )

        # make the request and decode the response according to the format sent
        response = self._request(text.encode('utf-8'), request_properties, **kwargs)
        chosen_format = request_properties["outputFormat"]
        if chosen_format == "json":
            return response.json()
        if chosen_format == "serialized":
            doc = Document()
            parseFromDelimitedString(doc, response.content)
            return doc
        if chosen_format in ["text", "conllu", "conll", "xml"]:
            return response.text
        return response