Example No. 1
def words_to_indices(self, words):
    chars_idx = []
    for word in words:
        # Map each character to its index, falling back to UNK.
        chars = [self.char_dict[char]
                 if char in self.char_dict else self.char_dict[UNK]
                 for char in word]
        chars_idx.append(chars)
    # Normalize and index the words only after every word's characters
    # have been collected; this block must sit outside the loop, otherwise
    # the method returns after processing the first word.
    words = [word_convert(word) for word in words]
    words_idx = [self.word_dict[word]
                 if word in self.word_dict else self.word_dict[UNK]
                 for word in words]
    return self.batcher.make_each_batch([words_idx], [chars_idx])
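All of these snippets lean on a word_convert helper that is never shown. Below is a minimal sketch of what it plausibly does, inferred from its keep_number and lowercase keyword arguments; the body and the defaults are assumptions, not the project's actual implementation:

    import re

    def word_convert(word, keep_number=False, lowercase=True):
        # Hypothetical reconstruction: lowercase the token and, unless
        # numbers are kept, collapse digit-bearing tokens to a placeholder.
        if lowercase:
            word = word.lower()
        if not keep_number and re.search(r"\d", word):
            word = "<NUM>"  # placeholder token; the real one is not shown
        return word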
Example No. 2
def _make_ids(_words):
    # Nested helper: defined inside a method, hence the access to the
    # enclosing instance's char_dict and word_dict via self.
    _char_ids = []
    _word_ids = []
    for word in _words:
        # Character-level indices, with UNK fallback.
        _char_ids.append([
            self.char_dict[char]
            if char in self.char_dict else self.char_dict[UNK]
            for char in word
        ])
        # Word-level index on the normalized form.
        word = word_convert(word, keep_number=False, lowercase=True)
        _word_ids.append(self.word_dict[word] if word in
                         self.word_dict else self.word_dict[UNK])
    return _char_ids, _word_ids
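A side note on the recurring `x if x in d else d[UNK]` pattern: dict.get expresses the same fallback more compactly. An equivalent rewrite of the helper above, with behavior unchanged:

    def _make_ids(_words):
        # Same lookups as above, using dict.get for the UNK fallback.
        _char_ids = [[self.char_dict.get(c, self.char_dict[UNK]) for c in w]
                     for w in _words]
        _word_ids = [self.word_dict.get(
                         word_convert(w, keep_number=False, lowercase=True),
                         self.word_dict[UNK])
                     for w in _words]
        return _char_ids, _word_ids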
Example No. 3
def load_dataset(self, filename, keep_number=False, lowercase=True):
    dataset = []
    for record in load_json(filename):
        # Normalize every token in the sentence.
        words = [
            word_convert(word,
                         keep_number=keep_number,
                         lowercase=lowercase) for word in record["words"]
        ]
        dataset.append({
            "sent_id": record["sent_id"],
            "words": words,
            "tags": record["spans"]  # span-style tags: (tag, start, end)
        })
    return dataset
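load_json is a project helper that is not shown here. Judging from the keys accessed above and from the (tag, i, j) unpacking in Example No. 5, each record plausibly has the following shape; the values are illustrative only:

    record = {
        "sent_id": 17,                                    # sentence identifier
        "words": ["John", "lives", "in", "New", "York"],  # raw tokens
        "spans": [["PER", 0, 0], ["LOC", 3, 4]],          # (tag, start, end)
    }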
Example No. 4
import codecs

def raw_dataset_iter(filename, encoding="utf-8"):
    with codecs.open(filename, mode="r", encoding=encoding) as f:
        words, tags = [], []
        for line in f:
            line = line.strip()
            if len(line) == 0 or line.startswith("--------------"):
                # A blank or separator line marks the end of a sentence.
                if len(words) != 0:
                    yield words, tags
                    words, tags = [], []
            else:
                # Each token line holds three tab-separated fields;
                # the first is ignored.
                _, word, tag = line.split("\t")
                word = word_convert(word)
                words.append(word)
                tags.append(tag)
        # Emit the final sentence if the file does not end with a blank line.
        if words:
            yield words, tags
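A minimal usage sketch, assuming a file in the tab-separated format the parser expects; the filename is hypothetical:

    for words, tags in raw_dataset_iter("train.tsv"):
        print(list(zip(words, tags)))  # one normalized (word, tag) pair per token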
Example No. 5
def build_dataset(data, word_dict, char_dict, tag_dict):
    dataset = []
    for record in data:
        chars_list = []
        words = []
        for word in record["words"]:
            # Character indices with UNK fallback.
            chars = [
                char_dict[char] if char in char_dict else char_dict[UNK]
                for char in word
            ]
            chars_list.append(chars)
            # Word index on the normalized form.
            word = word_convert(word, keep_number=False, lowercase=True)
            words.append(word_dict[word] if word in
                         word_dict else word_dict[UNK])
        # Map each span tag (tag, start, end) to its index, keeping offsets.
        tags = [(tag_dict[tag], i, j) for (tag, i, j) in record["tags"]]
        dataset.append({"words": words, "chars": chars_list, "tags": tags})
    return dataset
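The word_dict, char_dict and tag_dict arguments are plain token-to-index mappings built elsewhere in the project. A minimal sketch of how such a vocabulary could be assembled; the UNK value and the min_count cutoff are assumptions:

    from collections import Counter

    UNK = "<UNK>"  # assumed value of the project's unknown-token constant

    def build_vocab(tokens, min_count=1):
        # Reserve index 0 for UNK, then number tokens by descending frequency.
        vocab = {UNK: 0}
        for token, count in Counter(tokens).most_common():
            if count >= min_count and token not in vocab:
                vocab[token] = len(vocab)
        return vocab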
Example No. 6
def raw_dataset_iter(filename, keep_number, lowercase):
    with codecs.open(filename, mode="r", encoding="utf-8") as f:
        words, tags = [], []
        for line in f:
            line = line.strip()
            if line.startswith("-DOCSTART-"):
                # CoNLL-style document delimiter; not a token line.
                continue
            if len(line) == 0:
                # A blank line marks the end of a sentence.
                if len(words) != 0:
                    yield words, tags
                    words, tags = [], []
            else:
                line = line.split()
                word = line[0]
                tag = line[-1]
                word = word_convert(word, keep_number=keep_number,
                                    lowercase=lowercase)
                words.append(word)
                tags.append(tag)
        # Emit the final sentence if the file does not end with a blank line.
        if words:
            yield words, tags
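A usage sketch, assuming a CoNLL-2003-style file with blank-line-separated sentences and -DOCSTART- document markers; the path is hypothetical:

    for words, tags in raw_dataset_iter("eng.train", keep_number=False,
                                        lowercase=True):
        assert len(words) == len(tags)  # one tag per normalized token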
Example No. 7
def raw_dataset_iter(dataset, task_name, keep_number, lowercase):
    # Here `dataset` is already parsed: a list of sentences, each a list of
    # per-token tuples whose first four fields are (word, pos, chunk, ner).
    for sentence in dataset:
        words, tags = [], []
        for word_details in sentence:
            word, pos, chunk, ner = word_details[0:4]
            # Pick the tag column that matches the requested task.
            if task_name == "ner":
                tag = ner
            elif task_name == "chunk":
                tag = chunk
            else:
                tag = pos
            word = word_convert(word,
                                keep_number=keep_number,
                                lowercase=lowercase)
            words.append(word)
            tags.append(tag)
        yield words, tags
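Unlike the file-based variants, this iterator consumes an already-parsed structure. A sketch with one hand-built two-token sentence; the tuples are illustrative:

    sample = [[("John", "NNP", "B-NP", "B-PER"),
               ("sleeps", "VBZ", "B-VP", "O")]]
    for words, tags in raw_dataset_iter(sample, "ner",
                                        keep_number=False, lowercase=True):
        print(words, tags)  # tags taken from the NER column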
Example No. 8
def words_to_indices(self, words):
    """
    Convert input words into batched word/char indices for inference.
    :param words: input words
    :return: batched word and character indices
    """
    chars_idx = []
    for word in words:
        chars = [
            self.char_dict[char]
            if char in self.char_dict else self.char_dict[UNK]
            for char in word
        ]
        chars_idx.append(chars)
    words = [word_convert(word) for word in words]
    words_idx = [
        self.word_dict[word]
        if word in self.word_dict else self.word_dict[UNK]
        for word in words
    ]
    return process_batch_data([words_idx], [chars_idx])
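A usage sketch; `tagger` stands in for whatever object carries this method, and the downstream feed into a model is omitted because the output structure of process_batch_data is not shown:

    batch = tagger.words_to_indices(["John", "lives", "in", "Berlin"])
    # `batch` is a one-sentence batch of word and character indices.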
Example No. 9
def raw_dataset_iter(filename, task_name, keep_number, lowercase):
    with codecs.open(filename, mode="r", encoding="utf-8") as f:
        words, tags = [], []
        for line in f:
            line = line.strip()
            if len(line) == 0 or line.startswith("-DOCSTART-"):
                # Blank line or document delimiter: end of a sentence.
                if len(words) != 0:
                    yield words, tags
                    words, tags = [], []
            else:
                # CoNLL-2003 line: "word POS chunk NER", single-space separated.
                word, pos, chunk, ner = line.split(" ")
                # Pick the tag column that matches the requested task.
                if task_name == "ner":
                    tag = ner
                elif task_name == "chunk":
                    tag = chunk
                else:
                    tag = pos
                word = word_convert(word,
                                    keep_number=keep_number,
                                    lowercase=lowercase)
                words.append(word)
                tags.append(tag)
        # Emit the final sentence if the file does not end with a blank line.
        if words:
            yield words, tags
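One caveat: line.split(" ") breaks on runs of multiple spaces or on tabs. Whitespace-agnostic splitting, as in Example No. 6, is more robust:

    word, pos, chunk, ner = line.split()[:4]  # tolerates any whitespace between columns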