def load_file_pipeline(self, pipeline: str, pipeline_type: str):
    """Load the given pipeline and convert it to JSON.

    Arguments:
        pipeline {str} -- Path, URL, or raw string containing the pipeline.
        pipeline_type {str} -- Source type: [json, yaml, xml, url, json_str]

    Returns:
        {json} -- The pipeline is always returned in JSON format.
    """
    try:
        result = ""
        if pipeline_type == "json":
            result = Util().openfile_json(pipeline)
        elif pipeline_type == "yaml":
            result = OutputFormat().yaml2json(pipeline)
        elif pipeline_type == "xml":
            result = OutputFormat().xml2json(pipeline)
        elif pipeline_type == "url":
            # Fetch the pipeline over HTTP and decode the body as JSON.
            with urllib.request.urlopen(pipeline) as url:
                result = json.loads(url.read().decode())
        elif pipeline_type == "json_str":
            result = json.loads(pipeline)
        log.logger.info("Custom Pipeline: {}".format(result))
        return result
    except Exception as err:
        log.logger.error(err)
        # Exit with a non-zero status so callers can detect the failure.
        sys.exit(1)
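# A minimal usage sketch for load_file_pipeline. The instance name
# `executor` and the file names below are assumptions for illustration
# only; each of the five supported source types yields the same JSON
# structure on return.
#
#   executor.load_file_pipeline("pipeline.json", "json")
#   executor.load_file_pipeline("pipeline.yaml", "yaml")
#   executor.load_file_pipeline("pipeline.xml", "xml")
#   executor.load_file_pipeline("https://example.com/pipeline.json", "url")
#   executor.load_file_pipeline('{"lang": "en", "tools": {}}', "json_str")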
def prepare_raw_text(self, raw_text):
    log.logger.info(
        "Pre-processing - Executing sentence split on the raw text.")
    list_sentences = list()
    # Pre-processing: tokenization and sentence split using the selected base plugin.
    if self._tool_base == "stanza":
        doc_annotation = PluginManager().call_plugin_nlp(
            plugin_name="preprocessing",
            document=raw_text,
            pipeline={
                "lang": self._custom_pipeline["lang"],
                "tools": {
                    "stanza": {
                        "processors": ["tokenize"]
                    }
                },
            },
        )
        # Join the tokens of each sentence back into a plain string.
        for sentence in doc_annotation.sentences:
            list_tokens = list()
            for token in sentence.tokens:
                list_tokens.append(token.text)
            list_sentences.append(" ".join(list_tokens))
    if self._tool_base == "stanfordcorenlp":
        doc_annotation = PluginManager().call_plugin_nlp(
            plugin_name="preprocessing",
            document=raw_text,
            pipeline={
                "lang": self._custom_pipeline["lang"],
                "tools": {
                    "stanfordcorenlp": {
                        "processors": ["ssplit"]
                    }
                },
            },
        )
        for item in doc_annotation[0]["sentences"]:
            sentence = list()
            for token in item["tokens"]:
                sentence.append(token["word"])
            list_sentences.append(" ".join(sentence))
    if self._use_db is not None:
        # Insert the dataset in the database.
        self.ID_DATASET = PluginManager().call_plugin_db(
            plugin_name=self._use_db,
            operation="insert",
            collection="dataset",
            document={
                "name": "dataset_" + RandomObjectId().gen_random_object_id_string(),
                "data_time": OutputFormat().data_time(),
            },
        )
        # Insert the document(s) in the database.
        PluginManager().call_plugin_db(
            plugin_name=self._use_db,
            operation="insert",
            collection="document",
            document={
                "_id_dataset": self.ID_DATASET,
                "name": "document_" + RandomObjectId().gen_random_object_id_string(),
                "sentences": list_sentences,
            },
        )
        return self.get_all_documents_database()
    # Not using a database: build the document in memory instead.
    else:
        # Generate a document id.
        _id = RandomObjectId().gen_random_object_id()
        # Generate an id for the dataset.
        self.ID_DATASET = RandomObjectId().gen_random_object_id()
        # Generate a name for the document.
        name = "document_" + RandomObjectId().gen_random_object_id_string()
        document = {
            "_id": _id,
            "_id_dataset": self.ID_DATASET,
            "name": name,
            "sentences": list_sentences,
        }
        return [document]
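# Hedged sketch of the return value of prepare_raw_text when no database
# plugin is configured (self._use_db is None). The id values and sentence
# strings are placeholders; the ids come from RandomObjectId and the
# sentences from the tokenize/ssplit pass above.
#
#   [{
#       "_id": ObjectId("..."),
#       "_id_dataset": ObjectId("..."),
#       "name": "document_<random-id>",
#       "sentences": ["First sentence .", "Second sentence ."],
#   }]
#
# When a database plugin is configured, the same document shape is inserted
# into the "document" collection and read back via get_all_documents_database().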
def type_output_file(self, annotation):
    # Convert the annotation to the requested output format. Any format
    # value other than "xml" or "json" falls through and returns None.
    if self._format == "xml":
        return OutputFormat().json2xml(annotation)
    elif self._format == "json":
        return annotation
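# Usage sketch for type_output_file, assuming an instance whose `_format`
# attribute was set from the custom pipeline (the names `executor` and
# `annotation` are illustrative, not part of the source):
#
#   executor._format = "xml"
#   xml_output = executor.type_output_file(annotation)   # JSON -> XML
#   executor._format = "json"
#   json_output = executor.type_output_file(annotation)  # returned as-is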