Example #1
File: wordnet.py Project: natuan/nlpaug
    def substitute(self, text):
        results = []

        tokens = self.tokenizer(text)

        pos = nltk.pos_tag(tokens)

        aug_idxes = self._get_aug_idxes(pos)
        if aug_idxes is None:
            return text

        for i, token in enumerate(tokens):
            # Skip words that were not selected for augmentation
            if i not in aug_idxes:
                results.append(token)
                continue

            word_poses = PartOfSpeech.pos2wn(pos[i][1])
            synsets = []
            if word_poses is None or len(word_poses) == 0:
                # Fall back to every synset when the POS mapping is not defined
                synsets.extend(self.model.synsets(pos[i][0], lang=self.lang))
            else:
                for word_pos in word_poses:
                    synsets.extend(
                        self.model.synsets(pos[i][0],
                                           pos=word_pos,
                                           lang=self.lang))

            augmented_data = []
            for synset in synsets:
                candidates = []
                for lemma in synset.lemmas():
                    if self.synonyms:
                        candidates.append(lemma.name())
                    elif lemma.antonyms():
                        candidates.append(lemma.antonyms()[0].name())

                for candidate in candidates:
                    if candidate.lower() != token.lower():
                        augmented_data.append(candidate)

            if len(augmented_data) == 0:
                results.append(token)
            else:
                candidate = self.sample(augmented_data, 1)[0]
                candidate = candidate.replace("_", " ").replace("-", " ").lower()
                results.append(self.align_capitalization(token, candidate))

        return self.reverse_tokenizer(results)
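
For reference, here is a minimal standalone sketch of the WordNet lookup that substitute() performs, using NLTK directly. It assumes the wordnet corpus has already been downloaded via nltk.download('wordnet'); the word "good" is only an illustration.

from nltk.corpus import wordnet

token = "good"
synonyms, antonyms = [], []
for synset in wordnet.synsets(token, pos=wordnet.ADJ):
    for lemma in synset.lemmas():
        # Collect synonym lemma names, and the first antonym when one exists
        synonyms.append(lemma.name())
        if lemma.antonyms():
            antonyms.append(lemma.antonyms()[0].name())

print(set(synonyms))  # e.g. includes 'good', 'well'
print(set(antonyms))  # e.g. includes 'bad', 'evil'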
Example #2
    def substitute(self, text):
        results = []

        tokens = self.tokenizer(text)

        pos = nltk.pos_tag(tokens)

        aug_cnt = self.generate_aug_cnt(len(tokens))
        word_idxes = list(range(len(tokens)))
        word_idxes = self.skip_aug(word_idxes, pos)
        aug_idxes = self.sample(word_idxes, aug_cnt)

        for i, token in enumerate(tokens):
            # Skip words that were not selected for augmentation
            if i not in aug_idxes:
                results.append(token)
                continue

            word_poses = PartOfSpeech.pos2wn(pos[i][1])
            synsets = []
            if word_poses is None or len(word_poses) == 0:
                # Fall back to every synset when the POS mapping is not defined
                synsets.extend(self.model.synsets(pos[i][0]))
            else:
                for word_pos in word_poses:
                    synsets.extend(self.model.synsets(pos[i][0], pos=word_pos))

            augmented_data = []
            for synset in synsets:
                for candidate in synset.lemma_names():
                    if candidate.lower() != token.lower():
                        augmented_data.append(candidate)

            if len(augmented_data) == 0:
                results.append(token)
            else:
                candidate = self.sample(augmented_data, 1)[0]
                results.append(self.align_capitalization(token, candidate))

        return self.reverse_tokenizer(results)
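
The PartOfSpeech.pos2wn helper used in all three versions is not shown here. A plausible minimal equivalent, mapping the Penn Treebank tags returned by nltk.pos_tag to WordNet POS constants, might look like the following; the actual mapping in nlpaug may differ.

from nltk.corpus import wordnet

def pos2wn(treebank_tag):
    # Hypothetical sketch: map a Penn Treebank tag prefix to a list of
    # WordNet POS constants, or None when there is no counterpart.
    if treebank_tag.startswith('NN'):
        return [wordnet.NOUN]
    if treebank_tag.startswith('VB'):
        return [wordnet.VERB]
    if treebank_tag.startswith('JJ'):
        return [wordnet.ADJ, wordnet.ADJ_SAT]
    if treebank_tag.startswith('RB'):
        return [wordnet.ADV]
    return None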
Example #3
File: wordnet.py Project: sainiudit/nlpaug
    def substitute(self, data):
        results = []

        tokens = self.tokenizer(data)
        pos = self.model.pos_tag(tokens)

        aug_idxes = self._get_aug_idxes(pos)
        if aug_idxes is None:
            return data

        for i, token in enumerate(tokens):
            # Skip words that were not selected for augmentation
            if i not in aug_idxes:
                results.append(token)
                continue

            word_poses = PartOfSpeech.pos2wn(pos[i][1])
            candidates = []
            if word_poses is None or len(word_poses) == 0:
                # Fall back to every prediction when the POS mapping is not defined
                candidates.extend(self.model.predict(pos[i][0]))
            else:
                for word_pos in word_poses:
                    candidates.extend(
                        self.model.predict(pos[i][0], pos=word_pos))

            candidates = [c for c in candidates if c.lower() != token.lower()]

            if len(candidates) == 0:
                results.append(token)
            else:
                candidate = self.sample(candidates, 1)[0]
                candidate = candidate.replace("_", " ").replace("-", " ").lower()
                results.append(self.align_capitalization(token, candidate))

        return self.reverse_tokenizer(results)
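
In the upstream nlpaug package these substitute() implementations are not called directly; they back the word-level SynonymAug augmenter. A typical invocation looks like this (note that recent nlpaug releases return a list of augmented strings from augment(), while older ones return a single string):

import nlpaug.augmenter.word as naw

# SynonymAug with aug_src='wordnet' drives the WordNet substitution shown above
aug = naw.SynonymAug(aug_src='wordnet')
text = "The quick brown fox jumps over the lazy dog"
print(aug.augment(text))  # output is randomized on every call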