import codecs

# NOTE: UNK, word_convert, and process_batch_data are project-level helpers
# (the unknown-token constant, the token normalizer, and the batch padder)
# assumed to be imported at module scope alongside this code.


def build_dataset(data, word_dict, char_dict, tag_dict):
    """Map each record's words, characters, and tags to integer indices."""
    dataset = []
    for record in data:
        chars_list = []
        words = []
        for word in record["words"]:
            # Character indices are built from the raw word; unknown characters
            # fall back to the UNK index.
            chars = [char_dict[char] if char in char_dict else char_dict[UNK]
                     for char in word]
            chars_list.append(chars)
            # Word indices use the normalized form produced by word_convert.
            word = word_convert(word, keep_number=False, lowercase=True)
            words.append(word_dict[word] if word in word_dict else word_dict[UNK])
        tags = [tag_dict[tag] for tag in record["tags"]]
        dataset.append({"words": words, "chars": chars_list, "tags": tags})
    return dataset
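# A minimal usage sketch for build_dataset. The vocabularies and record below are
# illustrative assumptions (in the project they come from a vocabulary-building
# step), and it is assumed that UNK == "<UNK>" and that
# word_convert("John", keep_number=False, lowercase=True) returns "john":
#
#   word_dict = {"<UNK>": 0, "john": 1, "paris": 2}
#   char_dict = {"<UNK>": 0, "j": 1, "o": 2, "h": 3, "n": 4}
#   tag_dict = {"O": 0, "B-PER": 1, "B-LOC": 2}
#   records = [{"words": ["John", "Paris"], "tags": ["B-PER", "B-LOC"]}]
#   dataset = build_dataset(records, word_dict, char_dict, tag_dict)
#   # dataset[0] == {"words": [1, 2], "chars": [[0, 2, 3, 4], [0, 0, 0, 0, 0]],
#   #                "tags": [1, 2]}
#   # Note that char indices come from the raw word, so the uppercase "J" and
#   # "P" fall back to UNK even though the word lookup uses the lowercased form.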
def raw_dataset_iter(filename, encoding="utf-8"):
    """Yield (words, tags) sentence pairs from a tab-separated tagged file."""
    with codecs.open(filename, mode="r", encoding=encoding) as f:
        words, tags = [], []
        for line in f:
            line = line.strip()
            # A blank line or a "--------------" separator ends the current sentence.
            if len(line) == 0 or line.startswith("--------------"):
                if len(words) != 0:
                    yield words, tags
                    words, tags = [], []
            else:
                # Each token line has three tab-separated fields; the first is discarded.
                _, word, tag = line.split("\t")
                word = word_convert(word, language="french")
                words.append(word)
                tags.append(tag)
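# Expected file layout for this iterator, inferred from the parsing code: three
# tab-separated fields per token line (the first, e.g. a token index, is
# discarded), with sentences separated by a dashed or blank line:
#
#   1	Le	O
#   2	president	O
#   3	Chirac	B-PER
#   --------------
#
#   for words, tags in raw_dataset_iter("train.txt"):
#       ...  # one (words, tags) pair per sentence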
def raw_dataset_iter(filename, task_name, keep_number, lowercase):
    """Yield (words, tags) sentence pairs from a CoNLL-style two-column file."""
    # task_name is accepted for interface compatibility but unused here.
    with codecs.open(filename, mode="r", encoding="utf-8") as f:
        words, tags = [], []
        for line in f:
            line = line.strip()
            # A blank line or a "-DOCSTART-" marker ends the current sentence.
            if len(line) == 0 or line.startswith("-DOCSTART-"):
                if len(words) != 0:
                    yield words, tags
                    words, tags = [], []
            else:
                word, tag = line.split(" ")
                word = word_convert(word, keep_number=keep_number, lowercase=lowercase)
                words.append(word)
                tags.append(tag)
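# This variant unpacks line.split(" ") into exactly two names, so it expects
# two space-separated columns per token line (word and tag); a full CoNLL-2003
# file carries four columns and would need to be reduced first. An inferred
# layout and call (filename is hypothetical):
#
#   EU B-ORG
#   rejects O
#
#   for words, tags in raw_dataset_iter("ner_2col.txt", task_name="ner",
#                                       keep_number=False, lowercase=True):
#       ...  # a blank line or -DOCSTART- line ends each sentence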
def words_to_indices(self, words):
    """
    Convert input words into batched word/char indices for inference.
    :param words: input words
    :return: batched word and character indices
    """
    chars_idx = []
    for word in words:
        # Character indices come from the raw word; unknown characters map to UNK.
        chars = [self.char_dict[char] if char in self.char_dict else self.char_dict[UNK]
                 for char in word]
        chars_idx.append(chars)
    # Normalize words before the vocabulary lookup; OOV words map to UNK.
    words = [word_convert(word, language=self.cfg["language"]) for word in words]
    words_idx = [self.word_dict[word] if word in self.word_dict else self.word_dict[UNK]
                 for word in words]
    return process_batch_data([words_idx], [chars_idx])
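# Hypothetical inference-time usage (a sketch: `tagger` stands for whatever
# object carries word_dict, char_dict, and cfg; process_batch_data is assumed
# to pad the word and char index lists into a batch of size 1):
#
#   batch = tagger.words_to_indices(["John", "lives", "in", "Paris"])
#   # batch now holds padded word and character indices ready for the model.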