예제 #1
0
 def _get_aug_idxes(self, tokens):
     """Select the token positions that should be augmented.

     Positions whose token appears in ``self.stopwords`` are dropped, the
     survivors are passed through ``self.skip_aug``, and the final positions
     are drawn with ``self.sample``.

     :param tokens: Sequence of tokens to choose from.
     :return: The value returned by ``self.sample``, or ``None`` when no
         candidate position is left after filtering.
     """
     target_cnt = self.generate_aug_cnt(len(tokens))
     candidates = self.skip_aug(
         [pos for pos, token in enumerate(tokens) if token not in self.stopwords],
         tokens)

     if len(candidates) > 0:
         # Never ask the sampler for more positions than are available.
         return self.sample(candidates, min(target_cnt, len(candidates)))

     # Nothing left to augment: optionally report it, then signal "no indexes".
     if self.verbose > 0:
         Warning(name=WarningName.OUT_OF_VOCABULARY,
                 code=WarningCode.WARNING_CODE_002, msg=WarningMessage.NO_WORD).output()
     return None
예제 #2
0
    def _validate_augment(self, data):
        """Check that the input to augment is usable.

        :param data: Input to validate; anything supporting ``len()``, or ``None``.
        :return: An empty list when ``data`` is non-empty, otherwise a
            one-element list holding the length-is-zero ``Warning``.
        """
        # Happy path first: there is something to augment, so nothing to report.
        if data is not None and len(data) > 0:
            return []

        empty_input_warning = Warning(
            name=WarningName.INPUT_VALIDATION_WARNING,
            code=WarningCode.WARNING_CODE_001,
            msg=WarningMessage.LENGTH_IS_ZERO)
        return [empty_input_warning]
예제 #3
0
    def _get_aug_idxes(self, tokens, aug_p, mode):
        """Select the positions in ``tokens`` that should be augmented.

        In ``Method.CHAR`` mode, inputs no longer than ``self.min_char`` are
        rejected up front and candidates are filtered by ``self.skip_aug``.
        In ``Method.WORD`` mode, positions whose token is in
        ``self.stopwords`` are removed instead.

        :param tokens: Sequence of tokens (or characters) to choose from.
        :param aug_p: Value forwarded to ``self.generate_aug_cnt``.
        :param mode: ``Method.WORD`` or ``Method.CHAR``; selects the filter.
        :return: The value returned by ``self.sample``, or ``None`` when the
            input is too short (char mode) or no candidate position remains.
        """
        char_mode = mode == Method.CHAR
        if char_mode and len(tokens) <= self.min_char:
            return None

        target_cnt = self.generate_aug_cnt(len(tokens), aug_p)
        candidates = list(range(len(tokens)))
        if mode == Method.WORD:
            candidates = [pos for pos in candidates if tokens[pos] not in self.stopwords]
        elif char_mode:
            candidates = self.skip_aug(candidates, tokens)

        if len(candidates) > 0:
            # Never ask the sampler for more positions than are available.
            return self.sample(candidates, min(target_cnt, len(candidates)))

        # Nothing left to augment: optionally report it, then signal "no indexes".
        if self.verbose > 0:
            Warning(name=WarningName.OUT_OF_VOCABULARY,
                    code=WarningCode.WARNING_CODE_002, msg=WarningMessage.NO_WORD).output()
        return None
예제 #4
0
    def _get_aug_idxes(self, tokens):
        """Select token positions to augment, weighted by TF-IDF probability.

        Positions whose token is in ``self.stopwords`` are dropped and the
        rest are passed through ``self.skip_aug``. Each remaining position is
        then accepted when ``self.prob()`` falls below the probability that
        ``self.model.cal_tfidf`` returned for that same position. Up to three
        passes are made over the not-yet-accepted positions; if too few are
        accepted, the shortfall is filled with ``self.sample`` regardless of
        probability.

        :param tokens: Sequence of tokens to choose from.
        :return: The value returned by ``self.sample`` for the accepted
            positions, or ``None`` when no candidate position remains.
        """
        aug_cnt = self.generate_aug_cnt(len(tokens))
        word_idxes = [i for i, t in enumerate(tokens) if t not in self.stopwords]
        word_idxes = self.skip_aug(word_idxes, tokens)

        if len(word_idxes) == 0:
            if self.verbose > 0:
                exception = Warning(name=WarningName.OUT_OF_VOCABULARY,
                                    code=WarningCode.WARNING_CODE_002, msg=WarningMessage.NO_WORD)
                exception.output()
            return None
        if len(word_idxes) < aug_cnt:
            aug_cnt = len(word_idxes)

        aug_probs = self.model.cal_tfidf(word_idxes, tokens)
        # Bind every candidate to its own probability once, so the pairing
        # stays correct after some candidates have been accepted and removed.
        candidates = list(zip(word_idxes, aug_probs))
        aug_idxes = []

        # It is possible that too few tokens are picked in one pass, so retry.
        retry_cnt = 3
        for _ in range(retry_cnt):
            if len(aug_idxes) >= aug_cnt:
                break
            # Build the leftover list separately instead of removing from the
            # list being iterated, which would skip elements.
            remaining = []
            for i, p in candidates:
                if self.prob() < p:
                    aug_idxes.append(i)
                else:
                    remaining.append((i, p))
            candidates = remaining

        # If still short, pick randomly among the leftovers regardless of probability.
        if len(aug_idxes) < aug_cnt:
            possible_idxes = [i for i, _p in candidates]
            aug_idxes.extend(self.sample(possible_idxes, aug_cnt - len(aug_idxes)))

        # A full pass may accept more than requested; trim down to aug_cnt.
        aug_idxes = self.sample(aug_idxes, aug_cnt)

        return aug_idxes