Example #1
    def load_file_pipeline(self, pipeline: str, pipeline_type: str):
        """Load and convert the sent pipeline to Json..

        Arguments:
            pipeline {str} -- Path of the file containing the pipeline.
            pipeline_type {str} -- File type: [json, yaml, xml]

        Returns:
            {json} -- [The pipeline is always returned in Json format.]
        """
        try:
            result = ""

            if pipeline_type == "json":
                result = Util().openfile_json(pipeline)
            elif pipeline_type == "yaml":
                result = OutputFormat().yaml2json(pipeline)
            elif pipeline_type == "xml":
                result = OutputFormat().xml2json(pipeline)
            elif pipeline_type == "url":
                with urllib.request.urlopen(pipeline) as url:
                    result = json.loads(url.read().decode())
            elif pipeline_type == "json_str":
                result = json.loads(pipeline)

            log.logger.info("Custom Pipeline: {}".format(result))
            return result
        except Exception as err:
            log.logger.error(err)
            sys.exit(1)
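
A minimal usage sketch for the loader above; the wrapper class name PipelineLoader and the inputs are placeholders (hypothetical), assuming the method is exposed on such a class:

    # PipelineLoader is a hypothetical stand-in for the class that defines
    # load_file_pipeline; the path and JSON string below are placeholders.
    loader = PipelineLoader()

    # A local YAML pipeline file is converted to JSON on load.
    pipeline = loader.load_file_pipeline("pipelines/custom.yaml", "yaml")

    # A JSON string can also be passed directly.
    pipeline = loader.load_file_pipeline('{"lang": "en", "tools": {}}', "json_str")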
Example #2
    def prepare_raw_text(self, raw_text):
        log.logger.info(
            "Pre-processing - Execute sentence split on raw text.")

        list_sentences = list()

        # pre-processing tokenization and ssplit using plugin base selected.
        if self._tool_base == "stanza":
            doc_annotation = PluginManager().call_plugin_nlp(
                plugin_name="preprocessing",
                document=raw_text,
                pipeline={
                    "lang": self._custom_pipeline["lang"],
                    "tools": {
                        "stanza": {
                            "processors": ["tokenize"]
                        }
                    },
                },
            )

            # join tokens.
            for sentence in doc_annotation.sentences:
                list_tokens = list()
                for token in sentence.tokens:
                    list_tokens.append(token.text)
                list_sentences.append(" ".join(list_tokens))

        if self._tool_base == "stanfordcorenlp":
            doc_annotation = PluginManager().call_plugin_nlp(
                plugin_name="preprocessing",
                document=raw_text,
                pipeline={
                    "lang": self._custom_pipeline["lang"],
                    "tools": {
                        "stanfordcorenlp": {
                            "processors": ["ssplit"]
                        }
                    },
                },
            )

            for item in doc_annotation[0]["sentences"]:
                sentence = list()
                for token in item["tokens"]:
                    sentence.append(token["word"])
                list_sentences.append(" ".join(sentence))

        if self._use_db is not None:
            # insert dataset in database.
            self.ID_DATASET = PluginManager().call_plugin_db(
                plugin_name=self._use_db,
                operation="insert",
                collection="dataset",
                document={
                    "name": "dataset_"
                    + RandomObjectId().gen_random_object_id_string(),
                    "data_time": OutputFormat().data_time(),
                },
            )

            # insert document(s) in database.
            PluginManager().call_plugin_db(
                plugin_name=self._use_db,
                operation="insert",
                collection="document",
                document={
                    "_id_dataset": self.ID_DATASET,
                    "name": "document_"
                    + RandomObjectId().gen_random_object_id_string(),
                    "sentences": list_sentences,
                },
            )

            return self.get_all_documents_database()

        # Not using database.
        else:
            # generates a document id.
            _id = RandomObjectId().gen_random_object_id()

            # generate an id for the dataset.
            self.ID_DATASET = RandomObjectId().gen_random_object_id()

            # generates a name for the document.
            name = "document_" + RandomObjectId().gen_random_object_id_string()

            document = {
                "_id": _id,
                "_id_dataset": self.ID_DATASET,
                "name": name,
                "sentences": list_sentences,
            }

            return [document]
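
For reference, the stanza tokenize-and-join step used above can be reproduced standalone; a minimal sketch assuming the stanza package is installed and its English model has been downloaded:

    import stanza

    # Mirrors the "processors": ["tokenize"] setting passed to the preprocessing
    # plugin above; assumes stanza.download("en") has been run once beforehand.
    nlp = stanza.Pipeline(lang="en", processors="tokenize")
    doc = nlp("Barack Obama was born in Hawaii. He was elected president in 2008.")

    # Join tokens back into whitespace-separated sentences, as prepare_raw_text does.
    sentences = [" ".join(token.text for token in sent.tokens) for sent in doc.sentences]
    print(sentences)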
Example #3
    def type_output_file(self, annotation):
        """Return the annotation in the requested output format (xml or json)."""
        if self._format == "xml":
            return OutputFormat().json2xml(annotation)
        elif self._format == "json":
            return annotation