Пример #1
0
 def read_txt(self, file: str, number: int = -1, is_train: bool = True) -> List[Instance]:
     """Read a column-formatted file of blank-line-separated sentences.

     Every non-blank line describes one token. When the path contains
     ``"conll2003"`` the line must split into exactly three whitespace
     separated fields (word, POS tag, label); otherwise the word, POS tag
     and label are taken from columns 1, 3 and 10 (0-based). Each word is
     added to ``self.vocab`` (after digit normalisation when
     ``self.digit2zero`` is set).

     Args:
         file: Path of the UTF-8 encoded text file.
         number: Stop once this many sentences have been read. The default
             ``-1`` never matches, so the whole file is read.
         is_train: Not used by this reader; kept so the signature stays
             compatible with callers.

     Returns:
         One ``Instance`` per sentence, in file order.

     Raises:
         ValueError: A ``conll2003`` line does not have exactly 3 fields.
         IndexError: Any other line has fewer than 11 fields.
     """
     print("Reading file: " + file)
     insts = []
     # The file format cannot change mid-read, so decide it once.
     three_columns = "conll2003" in file
     with open(file, 'r', encoding='utf-8') as f:
         words = []
         labels = []
         tags = []
         for line in tqdm(f.readlines()):
             line = line.rstrip()
             if line == "":
                 # Blank line closes the current sentence.
                 insts.append(Instance(Sentence(words, None, None, tags), labels))
                 words = []
                 labels = []
                 tags = []
                 if len(insts) == number:
                     break
                 continue
             if three_columns:
                 word, pos, label = line.split()
             else:
                 vals = line.split()
                 word = vals[1]
                 pos = vals[3]
                 label = vals[10]
             if self.digit2zero:
                 word = re.sub(r'\d', '0', word)  # replace digit with 0.
             words.append(word)
             tags.append(pos)
             self.vocab.add(word)
             labels.append(label)
         if words:
             # The file did not end with a blank line: keep the last
             # sentence instead of silently dropping it. After a ``break``
             # the buffers are empty, so the ``number`` limit still holds.
             insts.append(Instance(Sentence(words, None, None, tags), labels))
     print("number of sentences: {}".format(len(insts)))
     return insts
Пример #2
0
 def read_from_file(self, file, number=-1, is_train=True):
     """Read a three-column, blank-line-separated file into instances.

     Every non-blank line must split into exactly three whitespace
     separated fields; the first is the word, the second is ignored and
     the third is the label. Each word (after digit normalisation when
     ``self.digit2zero`` is set) is recorded as a key of
     ``self.train_vocab`` or ``self.test_vocab`` depending on ``is_train``.

     Args:
         file: Path of the UTF-8 encoded text file.
         number: Stop once this many sentences have been read. The default
             ``-1`` never matches, so the whole file is read.
         is_train: Selects which vocabulary dict receives the words.

     Returns:
         One ``Instance`` per sentence, in file order.

     Raises:
         ValueError: A non-blank line does not have exactly 3 fields.
     """
     print("Reading file: " + file)
     insts = []
     with open(file, 'r', encoding='utf-8') as f:
         words = []
         labels = []
         for line in tqdm(f.readlines()):
             line = line.rstrip()
             if line == "":
                 # Blank line closes the current sentence.
                 insts.append(Instance(Sentence(words), labels))
                 words = []
                 labels = []
                 if len(insts) == number:
                     break
                 continue
             word, _, label = line.split()
             if self.digit2zero:
                 word = re.sub(r'\d', '0', word)
             words.append(word)
             if is_train:
                 self.train_vocab[word] = 0
             else:
                 self.test_vocab[word] = 0
             labels.append(label)
         if words:
             # The file did not end with a blank line: keep the last
             # sentence instead of silently dropping it. After a ``break``
             # the buffers are empty, so the ``number`` limit still holds.
             insts.append(Instance(Sentence(words), labels))
     return insts
Пример #3
0
 def read_from_file(self, file, number=-1, is_train=True):
     """Read a blank-line-separated token file for ``self.dataset``.

     Line layout depends on ``self.dataset``:

     * ``"conll2003"``: exactly three whitespace separated fields; the
       first is the word, the third the label.
     * ``"conll2002"``, ``"ecommerce"``, ``"youku"``: the first two fields
       are word and label. A line with a single field is taken to be a
       label whose word is set to ``","``.

     Instances receive consecutive ids starting at 0 via ``set_id``. New
     words (after digit normalisation when ``self.digit2zero`` is set) are
     appended to ``self.train_vocab`` or ``self.test_vocab`` depending on
     ``is_train``.

     Args:
         file: Path of the UTF-8 encoded text file.
         number: Stop once this many sentences have been read. The default
             ``-1`` never matches, so the whole file is read.
         is_train: Selects which vocabulary list receives the words.

     Returns:
         One ``Instance`` per sentence, in file order.

     Raises:
         Exception: ``self.dataset`` is not one of the supported names
             (raised when the first token line is reached).
         ValueError: A ``conll2003`` line does not have exactly 3 fields.
     """
     print("Reading file: " + file)
     insts = []
     vocab = self.train_vocab if is_train else self.test_vocab
     # Mirror the vocabulary list in a set so the per-token membership test
     # is O(1) instead of a linear scan of the list.
     seen = set(vocab)
     with open(file, 'r', encoding='utf-8') as f:
         words = []
         labels = []
         for line in tqdm(f.readlines()):
             line = line.rstrip()
             if line == "":
                 # Blank line closes the current sentence.
                 inst = Instance(Sentence(words), labels)
                 # Ids are consecutive, so the next id is the current count.
                 inst.set_id(len(insts))
                 insts.append(inst)
                 words = []
                 labels = []
                 if len(insts) == number:
                     break
                 continue
             if self.dataset == "conll2003":
                 word, _, label = line.split()
             elif self.dataset in ("conll2002", "ecommerce", "youku"):
                 x = line.split()
                 if len(x) == 1:
                     word = ","
                     label = x[0]
                 else:
                     word = x[0]
                     label = x[1]
             else:
                 raise Exception("unknown dataset: " + self.dataset +
                                 " during read data")
             if self.digit2zero:
                 word = re.sub(r'\d', '0', word)
             words.append(word)
             if word not in seen:
                 seen.add(word)
                 vocab.append(word)
             labels.append(label)
         if words:
             # The file did not end with a blank line: keep the last
             # sentence instead of silently dropping it. After a ``break``
             # the buffers are empty, so the ``number`` limit still holds.
             inst = Instance(Sentence(words), labels)
             inst.set_id(len(insts))
             insts.append(inst)
     return insts
Пример #4
0
def read_conll(res_file: str, number: int = -1) -> List[Instance]:
    """Read a result file holding gold labels and predictions per token.

    Sentences are separated by blank lines. Every non-blank line is split
    on whitespace and read as: column 1 word, column 2 POS tag, column 3
    head index (parsed as ``int``), column 4 dependency label, column 5
    label, column 6 predicted label (0-based columns). The predicted labels
    of each sentence are stored on ``inst.prediction``.

    Args:
        res_file: Path of the UTF-8 encoded text file.
        number: Stop once this many sentences have been read. The default
            ``-1`` never matches, so the whole file is read.

    Returns:
        One ``Instance`` per sentence, in file order.

    Raises:
        IndexError: A non-blank line has fewer than 7 fields.
        ValueError: The head column is not an integer.
    """
    print("Reading file: " + res_file)
    insts = []
    with open(res_file, 'r', encoding='utf-8') as f:
        words = []
        heads = []
        deps = []
        labels = []
        tags = []
        preds = []
        for line in tqdm(f.readlines()):
            line = line.rstrip()
            if line == "":
                # Blank line closes the current sentence.
                inst = Instance(Sentence(words, heads, deps, tags), labels)
                inst.prediction = preds
                insts.append(inst)
                words = []
                heads = []
                deps = []
                labels = []
                tags = []
                preds = []

                if len(insts) == number:
                    break
                continue
            vals = line.split()
            word = vals[1]
            pos = vals[2]
            head = int(vals[3])
            dep_label = vals[4]

            label = vals[5]
            pred_label = vals[6]

            words.append(word)
            heads.append(head)  # stored exactly as read; no index shift here
            deps.append(dep_label)
            tags.append(pos)
            labels.append(label)
            preds.append(pred_label)
        if words:
            # The file did not end with a blank line: keep the last sentence
            # instead of silently dropping it. After a ``break`` the buffers
            # are empty, so the ``number`` limit still holds.
            inst = Instance(Sentence(words, heads, deps, tags), labels)
            inst.prediction = preds
            insts.append(inst)
    print("number of sentences: {}".format(len(insts)))
    return insts
Пример #5
0
 def read_conll(self, file: str, number: int = -1, is_train: bool = True) -> List[Instance]:
     """Read a blank-line-separated dependency file into instances.

     Every non-blank line is split on whitespace and read as: column 1
     word, column 3 POS tag, column 6 head index, column 7 dependency
     label, column 10 label (0-based columns). Heads are stored shifted by
     -1, so a head of 0 in the file becomes -1. Each word is added to
     ``self.vocab`` (after digit normalisation when ``self.digit2zero`` is
     set). Labels starting with ``"B-"`` are counted and reported.

     Args:
         file: Path of the UTF-8 encoded text file.
         number: Stop once this many sentences have been read. The default
             ``-1`` never matches, so the whole file is read.
         is_train: Not used by this reader; kept so the signature stays
             compatible with callers.

     Returns:
         One ``Instance`` per sentence, in file order.

     Raises:
         ValueError: A sentence has more than one token with head 0, or
             the head column is not an integer.
         IndexError: A non-blank line has fewer than 11 fields.
     """
     print("Reading file: " + file)
     insts = []
     num_entity = 0
     find_root = False
     with open(file, 'r', encoding='utf-8') as f:
         words = []
         heads = []
         deps = []
         labels = []
         tags = []
         for line in tqdm(f.readlines()):
             line = line.rstrip()
             if line == "":
                 # Blank line closes the current sentence.
                 insts.append(Instance(Sentence(words, heads, deps, tags), labels))
                 words = []
                 heads = []
                 deps = []
                 labels = []
                 tags = []
                 find_root = False
                 if len(insts) == number:
                     break
                 continue
             vals = line.split()
             word = vals[1]
             head = int(vals[6])
             dep_label = vals[7]
             pos = vals[3]
             label = vals[10]
             if self.digit2zero:
                 word = re.sub(r'\d', '0', word)  # replace digit with 0.
             if head == 0:
                 # Remember the root so a second one in the same sentence
                 # is actually detected; the flag used to stay False
                 # forever, which made this check unreachable.
                 if find_root:
                     raise ValueError("already have a root")
                 find_root = True
             words.append(word)
             heads.append(head - 1)  # shift to 0-based; the root becomes -1
             deps.append(dep_label)
             tags.append(pos)
             self.vocab.add(word)
             labels.append(label)
             if label.startswith("B-"):
                 num_entity += 1
         if words:
             # The file did not end with a blank line: keep the last
             # sentence instead of silently dropping it. After a ``break``
             # the buffers are empty, so the ``number`` limit still holds.
             insts.append(Instance(Sentence(words, heads, deps, tags), labels))
     print("number of sentences: {}, number of entities: {}".format(len(insts), num_entity))
     return insts