예제 #1
0
    def update(self, doc, annotators=None, properties=None):
        if properties is None:
            properties = {}
            properties.update({
                'inputFormat': 'serialized',
                'outputFormat': 'serialized',
                'serializer': 'edu.stanford.nlp.pipeline.ProtobufAnnotationSerializer'
            })
        if annotators:
            properties['annotators'] = annotators if type(annotators) == str else ",".join(annotators)
        with io.BytesIO() as stream:
            writeToDelimitedString(doc, stream)
            msg = stream.getvalue()

        r = self._request(msg, properties)
        doc = Document()
        parseFromDelimitedString(doc, r.content)
        return doc
예제 #2
0
    def annotate(self,
                 text,
                 annotators=None,
                 output_format=None,
                 properties=None,
                 reset_default=None,
                 **kwargs):
        """
        Send a request to the CoreNLP server.

        :param (str | unicode) text: raw text for the CoreNLPServer to parse
        :param (list | string) annotators: list of annotators to use
        :param (str) output_format: output type from server: serialized, json, text, conll, conllu, or xml
        :param (dict) properties: additional request properties (written on top of defaults)
        :param (bool) reset_default: don't use server defaults

        Precedence for settings:

        1. annotators and output_format args
        2. Values from properties dict
        3. Client defaults self.annotators and self.output_format (set during client construction)
        4. Server defaults

        Additional request parameters (apart from CoreNLP pipeline properties) such as 'username' and 'password'
        can be specified with the kwargs.

        :return: request result
        """

        # validate request properties
        validate_corenlp_props(properties=properties,
                               annotators=annotators,
                               output_format=output_format)
        # set request properties
        request_properties = {}

        # start with client defaults
        if self.annotators is not None:
            request_properties['annotators'] = self.annotators
        if self.output_format is not None:
            request_properties['outputFormat'] = self.output_format

        # add values from properties arg
        # handle str case
        if type(properties) == str:
            if is_corenlp_lang(properties):
                properties = {'pipelineLanguage': properties.lower()}
                if reset_default is None:
                    reset_default = True
            else:
                raise ValueError(
                    f"Unrecognized properties keyword {properties}")

        if type(properties) == dict:
            request_properties.update(properties)

        # if annotators list is specified, override with that
        # also can use the annotators field the object was created with
        if annotators is not None and (type(annotators) == str
                                       or type(annotators) == list):
            request_properties['annotators'] = annotators if type(
                annotators) == str else ",".join(annotators)

        # if output format is specified, override with that
        if output_format is not None and type(output_format) == str:
            request_properties['outputFormat'] = output_format

        # make the request
        # if not explicitly set or the case of pipelineLanguage, reset_default should be None
        if reset_default is None:
            reset_default = False
        r = self._request(text.encode('utf-8'), request_properties,
                          reset_default, **kwargs)
        if request_properties["outputFormat"] == "json":
            return r.json()
        elif request_properties["outputFormat"] == "serialized":
            doc = Document()
            parseFromDelimitedString(doc, r.content)
            return doc
        elif request_properties["outputFormat"] in [
                "text", "conllu", "conll", "xml"
        ]:
            return r.text
        else:
            return r
예제 #3
0
    def annotate(self,
                 text,
                 annotators=None,
                 output_format=None,
                 properties_key=None,
                 properties=None,
                 **kwargs):
        """
        Send a request to the CoreNLP server.

        :param (str | unicode) text: raw text for the CoreNLPServer to parse
        :param (list | string) annotators: list of annotators to use
        :param (str) output_format: output type from server: serialized, json, text, conll, conllu, or xml
        :param (str) properties_key: key into properties cache for the client
        :param (dict) properties: additional request properties (written on top of defaults)

        The properties for a request are written in this order:

        1. Server default properties (server side)
        2. Properties from client's properties_cache corresponding to properties_key (client side)
           If the properties_key is the name of a Stanford CoreNLP supported language:
           [Arabic, Chinese, English, French, German, Spanish], the Stanford CoreNLP defaults will be used (server side)
        3. Additional properties corresponding to properties (client side)
        4. Special case specific properties: annotators, output_format (client side)

        :return: request result
        """
        # set properties for server call
        # first look for a cached default properties set
        # if a Stanford CoreNLP supported language is specified, just pass {pipelineLanguage="french"}
        if properties_key is not None:
            if properties_key.lower() in ['en', 'english']:
                request_properties = dict(ENGLISH_DEFAULT_REQUEST_PROPERTIES)
            elif properties_key.lower() in CoreNLPClient.PIPELINE_LANGUAGES:
                request_properties = {
                    'pipelineLanguage': properties_key.lower()
                }
            elif properties_key not in self.properties_cache:
                raise ValueError("Properties cache does not have '%s'" %
                                 properties_key)
            else:
                request_properties = dict(
                    self.properties_cache[properties_key])
        else:
            request_properties = {}
        # add on custom properties for this request
        if properties is None:
            properties = {}
        request_properties.update(properties)
        # if annotators list is specified, override with that
        if annotators is not None:
            request_properties['annotators'] = ",".join(
                annotators) if isinstance(annotators, list) else annotators
        # always send an output format with request
        # in some scenario's the server's default output format is unknown, so default to serialized
        if output_format is not None:
            request_properties['outputFormat'] = output_format
        if request_properties.get('outputFormat') is None:
            if self.server_start_info.get('props', {}).get('outputFormat'):
                request_properties['outputFormat'] = self.server_start_info[
                    'props']['outputFormat']
            else:
                request_properties[
                    'outputFormat'] = CoreNLPClient.DEFAULT_OUTPUT_FORMAT
        # make the request
        r = self._request(text.encode('utf-8'), request_properties, **kwargs)
        if request_properties["outputFormat"] == "json":
            return r.json()
        elif request_properties["outputFormat"] == "serialized":
            doc = Document()
            parseFromDelimitedString(doc, r.content)
            return doc
        elif request_properties["outputFormat"] in [
                "text", "conllu", "conll", "xml"
        ]:
            return r.text
        else:
            return r