Example #1
    def process(self,
                paths,
                train_ds: Iterable[str] = None,
                src_vocab_op: VocabularyOption = None,
                tgt_vocab_op: VocabularyOption = None,
                src_embed_op: EmbeddingOption = None):
        input_name, target_name = 'words', 'target'
        src_vocab = Vocabulary() if src_vocab_op is None else Vocabulary(
            **src_vocab_op)
        tgt_vocab = Vocabulary(unknown=None, padding=None) \
            if tgt_vocab_op is None else Vocabulary(**tgt_vocab_op)

        info = DataBundle(datasets=self.load(paths))
        _train_ds = [info.datasets[name] for name in train_ds] \
            if train_ds else info.datasets.values()
        src_vocab.from_dataset(*_train_ds, field_name=input_name)
        tgt_vocab.from_dataset(*_train_ds, field_name=target_name)
        src_vocab.index_dataset(*info.datasets.values(),
                                field_name=input_name,
                                new_field_name=input_name)
        tgt_vocab.index_dataset(*info.datasets.values(),
                                field_name=target_name,
                                new_field_name=target_name)
        info.vocabs = {input_name: src_vocab, target_name: tgt_vocab}

        if src_embed_op is not None:
            src_embed_op.vocab = src_vocab
            init_emb = EmbedLoader.load_with_vocab(**src_embed_op)
            info.embeddings[input_name] = init_emb

        for name, dataset in info.datasets.items():
            dataset.set_input(input_name)
            dataset.set_target(target_name)
        return info
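
A minimal call sketch for this pipe (all names below are illustrative, not from the source): MyLoader stands for whatever loader subclass defines the load() and process() shown above, and Vocabulary / DataBundle come from fastNLP.

# Hedged sketch: MyLoader and the file paths are hypothetical placeholders.
loader = MyLoader()
bundle = loader.process({'train': 'train.txt', 'dev': 'dev.txt'})
train_ds = bundle.datasets['train']    # DataSet with 'words'/'target' already indexed
word_vocab = bundle.vocabs['words']    # source Vocabulary built on the train split(s)
print(len(word_vocab), len(train_ds))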
Example #2
    def process(self,
                paths: Union[str, Dict[str, str]],
                src_vocab_opt: VocabularyOption = None,
                tgt_vocab_opt: VocabularyOption = None,
                src_embed_opt: EmbeddingOption = None,
                char_level_op=False):

        paths = check_dataloader_paths(paths)
        datasets = {}
        info = DataBundle()
        for name, path in paths.items():
            dataset = self.load(path)
            datasets[name] = dataset

        def wordtochar(words):
            chars = []
            for word in words:
                word = word.lower()
                for char in word:
                    chars.append(char)
                chars.append('')
            chars.pop()
            return chars
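        # Illustration (added, not in the original): wordtochar lower-cases each
        # word, splits it into characters and inserts '' as a word separator,
        # e.g. wordtochar(['It', 'is']) -> ['i', 't', '', 'i', 's'];
        # the trailing separator is removed by chars.pop().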

        input_name, target_name = 'words', 'target'
        info.vocabs = {}

        # split each word into characters
        if char_level_op:
            for dataset in datasets.values():
                dataset.apply_field(wordtochar,
                                    field_name="words",
                                    new_field_name='chars')
        src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(
            **src_vocab_opt)
        src_vocab.from_dataset(datasets['train'], field_name='words')
        src_vocab.index_dataset(*datasets.values(), field_name='words')

        tgt_vocab = Vocabulary(unknown=None, padding=None) \
            if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt)
        tgt_vocab.from_dataset(datasets['train'], field_name='target')
        tgt_vocab.index_dataset(*datasets.values(), field_name='target')

        info.vocabs = {"words": src_vocab, "target": tgt_vocab}

        info.datasets = datasets

        if src_embed_opt is not None:
            embed = EmbedLoader.load_with_vocab(**src_embed_opt,
                                                vocab=src_vocab)
            info.embeddings['words'] = embed

        for name, dataset in info.datasets.items():
            dataset.set_input("words")
            dataset.set_target("target")

        return info
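
A minimal call sketch (names are illustrative): MyLoader stands for the class defining this process(), and VocabularyOption is assumed to simply carry keyword arguments for Vocabulary, since it is unpacked with ** above.

# Hedged sketch: MyLoader and the file paths are hypothetical placeholders.
bundle = MyLoader().process(
    paths={'train': 'train.tsv', 'dev': 'dev.tsv', 'test': 'test.tsv'},
    src_vocab_opt=VocabularyOption(min_freq=2),  # assumed to forward Vocabulary kwargs
    char_level_op=True,                          # additionally builds a 'chars' field
)
print(bundle.vocabs['words'].to_index('the'))    # word -> index
print(bundle.datasets['train'][0]['words'])      # the field now holds indices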
Example #3
def _indexize(data_bundle,
              input_field_names=Const.INPUT,
              target_field_names=Const.TARGET,
              vocabulary=None):
    if isinstance(input_field_names, str):
        input_field_names = [input_field_names]
    if isinstance(target_field_names, str):
        target_field_names = [target_field_names]
    for input_field_name in input_field_names:
        if vocabulary is None:
            src_vocab = Vocabulary()
            src_vocab.from_dataset(
                *[
                    ds for name, ds in data_bundle.iter_datasets()
                    if 'train' in name
                ],
                field_name=input_field_name,
                no_create_entry_dataset=[
                    ds for name, ds in data_bundle.iter_datasets()
                    if ('train' not in name) and (
                        ds.has_field(input_field_name))
                ])

        else:
            src_vocab = vocabulary
        src_vocab.index_dataset(*data_bundle.datasets.values(),
                                field_name=input_field_name)
        data_bundle.set_vocab(src_vocab, input_field_name)

    for target_field_name in target_field_names:
        tgt_vocab = Vocabulary(unknown=None, padding=None)
        tgt_vocab.from_dataset(
            *[
                ds for name, ds in data_bundle.iter_datasets()
                if 'train' in name
            ],
            field_name=target_field_name,
            no_create_entry_dataset=[
                ds for name, ds in data_bundle.iter_datasets()
                if ('train' not in name) and (ds.has_field(target_field_name))
            ])
        if len(tgt_vocab._no_create_word) > 0:
            warn_msg = f"There are {len(tgt_vocab._no_create_word)} `{target_field_name}` labels" \
                       f" in {[name for name in data_bundle.datasets.keys() if 'train' not in name]} " \
                       f"data set but not in train data set!.\n" \
                       f"These label(s) are {tgt_vocab._no_create_word}"
            print(warn_msg)
        tgt_vocab.index_dataset(*[
            ds for ds in data_bundle.datasets.values()
            if ds.has_field(target_field_name)
        ],
                                field_name=target_field_name)
        data_bundle.set_vocab(tgt_vocab, target_field_name)

    return data_bundle
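
A usage sketch for _indexize, assuming data_bundle is already tokenized, i.e. its datasets carry Const.INPUT ('words') and Const.TARGET ('target') fields as produced by a Pipe; the shared-vocabulary variant below is only meant to illustrate the vocabulary parameter.

# Hedged sketch: data_bundle is assumed to be a pre-tokenized DataBundle.
data_bundle = _indexize(data_bundle)              # builds and applies vocabularies in place
word_vocab = data_bundle.get_vocab(Const.INPUT)   # the Vocabulary registered via set_vocab above
print(len(word_vocab))

# A pre-built vocabulary can be shared across input fields via `vocabulary`:
shared = Vocabulary()
shared.from_dataset(data_bundle.get_dataset('train'), field_name=Const.INPUT)
_indexize(data_bundle, input_field_names=Const.INPUT, vocabulary=shared)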
Example #4
    def setup(self, stage="train"):
        if stage == 'train':
            data = self.hparams.data
            # build dataset
            # indexes: the column indices in the CoNLL file; they depend on your file and may need modification.
            loader = ConllLoader([word, pos, head], indexes=[1, 3, 6])
            train_dataset = loader._load(data.train_file)
            val_dataset = loader._load(data.val_file)
            test_dataset = loader._load(data.test_file)

            def clean_word(words):
                def clean_number(word):
                    def is_number(s):
                        try:
                            float(s)
                            return True
                        except ValueError:
                            return False
                    if is_number(word):
                        return '0'
                    else:
                        return word
                # import re
                # def clean_number(w):
                #     new_w = re.sub('[0-9]{1,}([,.]?[0-9]*)*', '0', w)
                #     return new_w
                return [clean_number(word) for word in words]

            def numerize(heads):
                return [int(head) for head in heads]

            train_dataset.apply_field(clean_word, word, new_field_name=word)
            val_dataset.apply_field(clean_word, word, new_field_name=word)
            test_dataset.apply_field(clean_word, word, new_field_name=word)
            test_dataset.apply_field(numerize, head, new_field_name=head)
            train_dataset.add_seq_len(field_name=word, new_field_name=seq_len)
            val_dataset.add_seq_len(field_name=word, new_field_name=seq_len)
            test_dataset.add_seq_len(field_name=word, new_field_name=seq_len)


            pos_vocab = Vocabulary()
            pos_vocab.from_dataset(train_dataset, field_name=pos)

            if data.wordposastoken:
                '''
                Combine the POS tag and the word into a single token.
                Roughly speaking, we build the vocabulary over (POS, word) pairs such as (NT, 'word').
                Then, we replace every unknown word with its corresponding POS tag.
                Please refer to
                "Dependency Grammar Induction with Neural Lexicalization and Big Training Data"
                for details.
                '''
                def combine(x):
                    sent = list(zip(x[pos], x[word]))
                    return [x[0] + "_" + x[1] for x in sent]

                train_dataset.apply(combine, new_field_name=word)
                val_dataset.apply(combine, new_field_name=word)
                test_dataset.apply(combine, new_field_name=word)
                word_vocab = Vocabulary(min_freq=data.min_freq)
                word_vocab.from_dataset(train_dataset, field_name=word)

                '''
                Replace unknown words with their POS tags.
                '''

                # add all POS tags to the word vocabulary so that unknown words
                # can later be replaced by their POS tag.
                word_vocab.add_word_lst(pos_vocab.word2idx)
                word_vocab.index_dataset(train_dataset, field_name=word)
                word_vocab.index_dataset(val_dataset, field_name=word)
                word_vocab.index_dataset(test_dataset, field_name=word)
                unk = 1

                def replace(x):
                    poses = x[pos]
                    words = x[word]
                    for i in range(len(words)):
                        # 1 stands for unk; we replace each unknown word with its POS tag.
                        if words[i] == unk:
                            pos_tag_name = poses[i]
                            words[i] = word_vocab[pos_tag_name]
                    return words

                train_dataset.apply(replace, new_field_name=word)
                val_dataset.apply(replace, new_field_name=word)
                test_dataset.apply(replace, new_field_name=word)

                if data.use_emb:
                    if data.emb_type == 'fasttext':
                        model = FastText.load(data.embedding)
                    else:
                        raise NotImplementedError
                    word_vec = model.wv
                    emb = np.random.rand(len(word_vocab), data.word_emb_size)
                    for idx, w in word_vocab.idx2word.items():
                        if "_" in w:
                            w = w.split('_')[-1]
                            emb[idx] = word_vec[w]
                    emb = torch.from_numpy(emb)
                    self.pretrained_emb = emb.to(self.device).float()

                word2pos = np.zeros(shape=(len(word_vocab),))

                # map each token in the vocabulary to its corresponding POS tag.
                for idx, w in word_vocab.idx2word.items():
                    if idx == 0:  # 0 is the padding index
                        continue
                    if idx == 1:  # 1 is unk; map it to the POS vocabulary's unk index
                        word2pos[1] = 1
                        continue
                    if "_" in w:
                        pos_tag_name = w.split("_")[0]
                        word2pos[idx] = pos_vocab.word2idx[pos_tag_name]
                    else:
                        word2pos[idx] = pos_vocab.word2idx[w]
                self.word2pos = torch.from_numpy(word2pos).long().to(self.device)


            # if POS tag and word are not combined into a single token.
            else:
                # build the vocabulary either with a fixed maximum size or based on word frequency.
                if data.vocab_type == 'max_size':
                    word_vocab = Vocabulary(max_size=data.vocab_size)
                else:
                    word_vocab = Vocabulary(min_freq=data.min_freq)
                word_vocab.from_dataset(train_dataset, field_name=word)
                word_vocab.index_dataset(train_dataset, field_name=word)
                word_vocab.index_dataset(val_dataset, field_name=word)
                word_vocab.index_dataset(test_dataset, field_name=word)

            train_dataset.set_input(pos, word, seq_len)
            val_dataset.set_input(pos, word, seq_len)
            test_dataset.set_input(pos, word, seq_len)
            test_dataset.set_target(head)

            pos_vocab.index_dataset(train_dataset, field_name=pos)
            pos_vocab.index_dataset(val_dataset, field_name=pos)
            pos_vocab.index_dataset(test_dataset, field_name=pos)

            train_dataset_init = None

            '''
            Use an external unsupervised parser's output as a "pseudo gold tree" to initialize our model.
            '''
            if self.hparams.train.initializer == 'external':
                # column indices depend on your file format.
                conll_loader = ConllLoader([word, pos, head], indexes=[1, 4, 6])
                train_dataset_init = conll_loader._load(data.external_parser)
                train_dataset_init.add_seq_len(field_name=word, new_field_name=seq_len)
                train_dataset_init.apply_field(clean_word, word, new_field_name=word)
                train_dataset_init.apply_field(numerize, head, new_field_name=head)

                if not data.wordposastoken:
                    word_vocab.index_dataset(train_dataset_init, field_name=word)
                else:
                    train_dataset_init.apply(combine, new_field_name=word)
                    word_vocab.index_dataset(train_dataset_init, field_name=word)
                    train_dataset_init.apply(replace, new_field_name=word)

                pos_vocab.index_dataset(train_dataset_init, field_name=pos)

                if self.hparams.joint_training:
                    import copy
                    train_dataset_init_for_model2 = copy.deepcopy(train_dataset_init)

                # first-order model
                if (self.hparams.model.model_name == 'NeuralDMV') or (self.hparams.model.model_name == 'LexicalizedNDMV'):
                    rule_generator = RuleGenerator1o()

                # second-order model
                elif self.hparams.model.model_name == 'SiblingNDMV':
                    rule_generator = RuleGeneratorSib()

                elif self.hparams.model.model_name == 'JointFirstSecond':
                    rule_generator = RuleGenerator1o()
                    rule_generator_for_model2 = RuleGeneratorSib()

                else:
                    raise NameError

                self.setup_init_dataset(train_dataset_init, rule_generator)

                if self.hparams.joint_training:
                    self.setup_init_dataset(train_dataset_init_for_model2, rule_generator_for_model2)


            elif self.hparams.train.initializer == 'km':
                train_dataset_init = train_dataset

            self.pos_vocab = pos_vocab
            self.word_vocab = word_vocab
            self.train_dataset = train_dataset
            self.val_dataset = val_dataset
            self.test_dataset = test_dataset
            self.train_dataset_init = train_dataset_init
            if self.hparams.joint_training:
                self.train_dataset_init_for_model2 = train_dataset_init_for_model2

        else:
            raise NotImplementedError
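
After setup() runs with data.wordposastoken enabled, self.word2pos maps every word index to the index of its POS tag, so POS ids can be recovered from a batch of word ids by plain tensor indexing. A hedged sketch (dm and the word ids are illustrative):

import torch

dm.setup('train')                       # dm: an instance of the class defining setup()
word_ids = torch.tensor([[5, 12, 1]])   # indices from dm.word_vocab (1 = unk)
pos_ids = dm.word2pos[word_ids]         # corresponding indices in dm.pos_vocab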