def lexical_analysis(self, texts=None, data=None, use_gpu=False, batch_size=1, user_dict=None, return_tag=True):
    """Get the word segmentation results with the texts as input.

    Args:
        texts(list): the input texts to be segmented; supply either ``texts`` or ``data``.
        data(dict): key must be 'text', value is the list of texts to be segmented.
        use_gpu(bool): whether to use gpu to predict or not.
        batch_size(int): how many samples the program deals with per batch.
        user_dict: deprecated; please use the function set_user_dict() instead.
        return_tag(bool): whether to include the tag in the results or not.

    Returns:
        results(list): the word segmentation results.
    """
    # Avoid the mutable-default-argument pitfall: normalize None to the
    # documented empty defaults so repeated calls never share state.
    texts = [] if texts is None else texts
    data = {} if data is None else data

    if user_dict:
        logger.warning(
            "If you wanna use customized dictionary, please use the function set_user_dict() to set the dictionay. The parameter user_dict has been dropped!"
        )

    # GPU is only usable when CUDA_VISIBLE_DEVICES names a valid device id;
    # on any lookup/parse failure fall back to CPU (original best-effort
    # behavior, but without a bare except that would also swallow
    # KeyboardInterrupt/SystemExit).
    try:
        _places = os.environ["CUDA_VISIBLE_DEVICES"]
        int(_places[0])
    except (KeyError, ValueError, IndexError):
        use_gpu = False

    if texts != [] and isinstance(texts, list) and data == {}:
        predicted_data = texts
    elif texts == [] and isinstance(data, dict) and isinstance(
            data.get('text', None), list) and data['text']:
        predicted_data = data["text"]
    else:
        raise TypeError("The input data is inconsistent with expectations.")

    predicted_data = self.to_unicode(predicted_data)

    # Drop the empty strings like "" in predicted_data, remembering where
    # they were so placeholder results can be re-inserted afterwards.
    # (Loop variable renamed so it no longer shadows the `data` parameter.)
    empty_str_indexes = self._get_index(predicted_data)
    predicted_data = [item for item in predicted_data if item != ""]

    start_idx = 0
    iteration = int(math.ceil(len(predicted_data) / batch_size))
    results = []
    for i in range(iteration):
        if i < (iteration - 1):
            batch_data = predicted_data[start_idx:(start_idx + batch_size)]
        else:
            # Last batch takes whatever remains (may be smaller than batch_size).
            batch_data = predicted_data[start_idx:]
        start_idx = start_idx + batch_size
        tensor_words = self.texts2tensor(batch_data)

        if use_gpu:
            batch_out = self.gpu_predictor.run([tensor_words])
        else:
            batch_out = self.cpu_predictor.run([tensor_words])
        batch_result = parse_result(
            batch_data, batch_out[0], self.id2label_dict, interventer=self.interventer)
        results += batch_result

    # Re-insert placeholder entries at the positions of the dropped empty strings.
    for index in empty_str_indexes:
        results.insert(index, {"word": [""], "tag": [""]})

    if not return_tag:
        # pop() mutates each dict in place; the old `result = result.pop(...)`
        # reassignment was a no-op and has been removed.
        for result in results:
            result.pop("tag")
        return results

    return results
def lexical_analysis(self, texts=None, data=None, use_gpu=False, batch_size=1, return_tag=True, use_device=None):
    """Get the word segmentation results with the texts as input.

    Args:
        texts(list): the input texts to be segmented; supply either ``texts`` or ``data``.
        data(dict): key must be 'text', value is the list of texts to be segmented.
        use_gpu(bool): whether to use gpu to predict or not.
        batch_size(int): how many samples the program deals with per batch.
        return_tag(bool): whether to include the tag in the results or not.
        use_device (str): use cpu, gpu, xpu or npu; overwrites the use_gpu flag.

    Returns:
        results(list): the word segmentation results.

    Raises:
        ValueError: if ``use_device`` names an unsupported device.
        TypeError: if neither ``texts`` nor ``data`` matches the expected shape.
    """
    # Avoid the mutable-default-argument pitfall: normalize None to the
    # documented empty defaults so repeated calls never share state.
    texts = [] if texts is None else texts
    data = {} if data is None else data

    # Select the real predictor to use. The if/elif chain is kept (rather
    # than a dict) so that only the requested predictor attribute is ever
    # touched — an instance may not have every predictor configured.
    if use_device is not None:
        if use_device == "cpu":
            predictor = self.cpu_predictor
        elif use_device == "xpu":
            predictor = self.xpu_predictor
        elif use_device == "npu":
            predictor = self.npu_predictor
        elif use_device == "gpu":
            predictor = self.gpu_predictor
        else:
            # ValueError is a subclass of Exception, so existing callers
            # catching Exception still work.
            raise ValueError("Unsupported device: " + use_device)
    else:
        # use_device is not set, therefore follow use_gpu.
        if use_gpu:
            predictor = self.gpu_predictor
        else:
            predictor = self.cpu_predictor

    if texts != [] and isinstance(texts, list) and data == {}:
        predicted_data = texts
    elif texts == [] and isinstance(data, dict) and isinstance(
            data.get('text', None), list) and data['text']:
        predicted_data = data["text"]
    else:
        raise TypeError("The input data is inconsistent with expectations.")

    predicted_data = self.to_unicode(predicted_data)

    # Drop the empty strings like "" in predicted_data, remembering where
    # they were so placeholder results can be re-inserted afterwards.
    # (Loop variable renamed so it no longer shadows the `data` parameter.)
    empty_str_indexes = self._get_index(predicted_data)
    predicted_data = [item for item in predicted_data if item != ""]

    start_idx = 0
    iteration = int(math.ceil(len(predicted_data) / batch_size))
    results = []
    for i in range(iteration):
        if i < (iteration - 1):
            batch_data = predicted_data[start_idx:(start_idx + batch_size)]
        else:
            # Last batch takes whatever remains (may be smaller than batch_size).
            batch_data = predicted_data[start_idx:]
        start_idx = start_idx + batch_size

        batch_out = self._internal_predict(predictor, batch_data)
        batch_result = parse_result(
            batch_data, batch_out, self.id2label_dict, interventer=self.custom)
        results += batch_result

    # Re-insert placeholder entries at the positions of the dropped empty strings.
    for index in empty_str_indexes:
        results.insert(index, {"word": [""], "tag": [""]})

    if not return_tag:
        # pop() mutates each dict in place; the old `result = result.pop(...)`
        # reassignment was a no-op and has been removed.
        for result in results:
            result.pop("tag")
        return results

    return results
def cut(self, text, use_gpu=False, batch_size=1, return_tag=True):
    """Segment an entire text containing Chinese characters into separated words.

    Args:
        text(:obj:`str` or :obj:`List[str]`): The chinese texts to be segmented.
            This can be a string or a list of strings.
        use_gpu(bool): whether to use gpu to predict or not.
        batch_size(int): how many samples the program deals with per batch.
        return_tag(bool): whether to include the tag in the results or not.

    Returns:
        results(dict or list): The word segmentation result of the input text,
            whose key is 'word', if text is a list. If text is a str, the word
            segmentation result (list) is obtained.

    Raises:
        RuntimeError: if use_gpu is requested but CUDA_VISIBLE_DEVICES is unset/invalid.
        TypeError: if ``text`` is neither a non-empty str nor a non-empty list.
    """
    if use_gpu:
        # Narrowed from a bare except: only the failures of the env lookup
        # and the device-id parse should trigger this error message.
        try:
            _places = os.environ["CUDA_VISIBLE_DEVICES"]
            int(_places[0])
        except (KeyError, ValueError, IndexError):
            raise RuntimeError(
                "Environment Variable CUDA_VISIBLE_DEVICES is not set correctly. If you wanna use gpu, please set CUDA_VISIBLE_DEVICES as cuda_device_id."
            )

    if isinstance(text, list) and len(text) != 0:
        predicted_data = self.to_unicode(text)

        # Drop the empty strings like "" in predicted_data, remembering
        # where they were so placeholders can be re-inserted afterwards.
        empty_str_indexes = self._get_index(predicted_data)
        predicted_data = [item for item in predicted_data if item != ""]

        start_idx = 0
        iteration = int(math.ceil(len(predicted_data) / batch_size))
        results = []
        for i in range(iteration):
            if i < (iteration - 1):
                batch_data = predicted_data[start_idx:(start_idx + batch_size)]
            else:
                # Last batch takes whatever remains.
                batch_data = predicted_data[start_idx:]
            start_idx = start_idx + batch_size
            tensor_words = self.texts2tensor(batch_data)

            if use_gpu:
                batch_out = self.gpu_predictor.run([tensor_words])
            else:
                batch_out = self.cpu_predictor.run([tensor_words])
            batch_result = parse_result(
                batch_data, batch_out[0], self.id2label_dict, interventer=self.custom)
            results += batch_result

        # Re-insert placeholder entries at the positions of the dropped empty strings.
        for index in empty_str_indexes:
            results.insert(index, {"word": [""], "tag": [""]})

        if not return_tag:
            # pop() mutates each dict in place; the old `result = result.pop(...)`
            # reassignment was a no-op and has been removed.
            for result in results:
                result.pop("tag")
            return results

        return results
    elif isinstance(text, str) and text != "":
        tensor_words = self.texts2tensor([text])

        if use_gpu:
            batch_out = self.gpu_predictor.run([tensor_words])
        else:
            batch_out = self.cpu_predictor.run([tensor_words])
        batch_result = parse_result(
            [text], batch_out[0], self.id2label_dict, interventer=self.custom)

        return batch_result[0]['word']
    elif text == "":
        # An empty string segments to itself.
        return text
    else:
        raise TypeError("The input data is inconsistent with expectations.")
def cut(self, text, use_gpu=False, batch_size=1, return_tag=True, use_device=None):
    """Segment an entire text containing Chinese characters into separated words.

    Args:
        text(:obj:`str` or :obj:`List[str]`): The chinese texts to be segmented.
            This can be a string or a list of strings.
        use_gpu(bool): whether to use gpu to predict or not.
        batch_size(int): how many samples the program deals with per batch.
        return_tag(bool): whether to include the tag in the results or not.
        use_device (str): use cpu, gpu, xpu or npu; overwrites the use_gpu flag.

    Returns:
        results(dict or list): The word segmentation result of the input text,
            whose key is 'word', if text is a list. If text is a str, the word
            segmentation result (list) is obtained.

    Raises:
        ValueError: if ``use_device`` names an unsupported device.
        TypeError: if ``text`` is neither a non-empty str nor a non-empty list.
    """
    # Select the real predictor to use. The if/elif chain is kept (rather
    # than a dict) so that only the requested predictor attribute is ever
    # touched — an instance may not have every predictor configured.
    if use_device is not None:
        if use_device == "cpu":
            predictor = self.cpu_predictor
        elif use_device == "xpu":
            predictor = self.xpu_predictor
        elif use_device == "npu":
            predictor = self.npu_predictor
        elif use_device == "gpu":
            predictor = self.gpu_predictor
        else:
            # ValueError is a subclass of Exception, so existing callers
            # catching Exception still work.
            raise ValueError("Unsupported device: " + use_device)
    else:
        # use_device is not set, therefore follow use_gpu.
        if use_gpu:
            predictor = self.gpu_predictor
        else:
            predictor = self.cpu_predictor

    if isinstance(text, list) and len(text) != 0:
        predicted_data = self.to_unicode(text)

        # Drop the empty strings like "" in predicted_data, remembering
        # where they were so placeholders can be re-inserted afterwards.
        empty_str_indexes = self._get_index(predicted_data)
        predicted_data = [item for item in predicted_data if item != ""]

        start_idx = 0
        iteration = int(math.ceil(len(predicted_data) / batch_size))
        results = []
        for i in range(iteration):
            if i < (iteration - 1):
                batch_data = predicted_data[start_idx:(start_idx + batch_size)]
            else:
                # Last batch takes whatever remains.
                batch_data = predicted_data[start_idx:]
            start_idx = start_idx + batch_size

            batch_out = self._internal_predict(predictor, batch_data)
            batch_result = parse_result(
                batch_data, batch_out, self.id2label_dict, interventer=self.custom)
            results += batch_result

        # Re-insert placeholder entries at the positions of the dropped empty strings.
        for index in empty_str_indexes:
            results.insert(index, {"word": [""], "tag": [""]})

        if not return_tag:
            # pop() mutates each dict in place; the old `result = result.pop(...)`
            # reassignment was a no-op and has been removed.
            for result in results:
                result.pop("tag")
            return results

        return results
    elif isinstance(text, str) and text != "":
        batch_out = self._internal_predict(predictor, [text])
        batch_result = parse_result(
            [text], batch_out, self.id2label_dict, interventer=self.custom)

        return batch_result[0]['word']
    elif text == "":
        # An empty string segments to itself.
        return text
    else:
        raise TypeError("The input data is inconsistent with expectations.")