示例#1
0
 def __init__(self,
              encoding_type: str = 'bio',
              target_pad_val=0,
              bigram=False):
     """Set up tag conversion for the requested encoding scheme.

     Args:
         encoding_type: ``'bio'`` keeps IOB2 tags unchanged; any other
             value converts IOB2 tags on to the BIOES scheme.
         target_pad_val: padding value used for target sequences.
         bigram: whether bigram features are enabled (stored only).
     """
     # Pick the tag-conversion callable once, up front.
     self.convert_tag = (iob2 if encoding_type == 'bio'
                         else (lambda tags: iob2bioes(iob2(tags))))
     self.target_pad_val = int(target_pad_val)
     self.bigram = bigram
示例#2
0
    def test_iob2bioes(self):
        """IOB2 -> BIOES conversion on a chunking tag sequence."""
        # Each pair is (IOB2 input tag, expected BIOES output tag).
        pairs = [
            ('B-NP', 'S-NP'), ('O', 'O'), ('B-NP', 'S-NP'),
            ('B-VP', 'S-VP'), ('B-NP', 'B-NP'), ('I-NP', 'E-NP'),
            ('O', 'O'), ('B-NP', 'S-NP'), ('B-PP', 'S-PP'),
            ('B-NP', 'B-NP'), ('I-NP', 'E-NP'), ('O', 'O'),
            ('B-NP', 'B-NP'), ('I-NP', 'E-NP'), ('B-NP', 'S-NP'),
            ('O', 'O'), ('B-NP', 'B-NP'), ('I-NP', 'I-NP'),
            ('I-NP', 'E-NP'),
        ]
        source = [src for src, _ in pairs]
        expected = [out for _, out in pairs]
        self.assertSequenceEqual(expected, iob2bioes(source))
示例#3
0
def prepare_ptb(args):
    """Load the PTB POS / chunking / NER corpora and build vocabularies.

    Args:
        args: namespace with ``pos``, ``chunk`` and ``ner`` attributes,
            each a path to a CoNLL-format file.

    Returns:
        tuple: ``(task_lst, vocabs)`` where ``task_lst`` is a list of
        ``Task`` objects (one per task, in pos/chunk/ner order) and
        ``vocabs`` maps each task name plus ``'words'`` to its
        ``Vocabulary``.
    """
    task_names = ['pos', 'chunk', 'ner']

    datas = {}
    datas['pos'] = ConllLoader(headers=['words', 'pos'],
                               indexes=[0, 1]).load(args.pos).datasets
    chunk_data = ConllLoader(headers=['words', 'chunk'],
                             indexes=[0, 2]).load(args.chunk).datasets
    # The chunking corpus ships without a dev split: carve 10% off train.
    chunk_data['train'], chunk_data['dev'] = chunk_data['train'].split(0.1)
    datas['chunk'] = chunk_data
    datas['ner'] = ConllLoader(headers=['words', 'ner'],
                               indexes=[0, 3]).load(args.ner).datasets

    # Normalize tagging schemes: chunk -> IOB2, NER -> BIOES.
    for ds in datas['chunk'].values():
        ds.apply_field(iob2, 'chunk', 'chunk')
    for ds in datas['ner'].values():
        ds.apply_field(lambda x: iob2bioes(iob2(x)), 'ner', 'ner')

    # First pass: build the per-task label vocabularies and a shared
    # word vocabulary over all three corpora.
    vocabs = {}
    src_vocab = Vocabulary()
    for task_name in task_names:
        data = datas[task_name]
        filter_docstart(data)
        vocab = Vocabulary(padding=None, unknown=None)
        vocab.from_dataset(*data.values(), field_name=task_name)
        src_vocab.from_dataset(*data.values(), field_name='words')
        vocabs[task_name] = vocab

    # Second pass: index every dataset with the finished vocabularies
    # and wrap each task's splits into a Task object.
    task_lst = []
    for idx, task_name in enumerate(task_names):
        data = datas[task_name]
        src_vocab.index_dataset(*data.values(),
                                field_name='words',
                                new_field_name='words')
        vocabs[task_name].index_dataset(*data.values(),
                                        field_name=task_name,
                                        new_field_name=task_name)
        for ds in data.values():
            ds.apply_field(len, 'words', 'seq_len')
        task_lst.append(
            Task(idx, task_name, data['train'], data['dev'], data['test']))
    vocabs['words'] = src_vocab
    return task_lst, vocabs