def _get_aug_idxes(self, tokens): aug_cnt = self.generate_aug_cnt(len(tokens)) word_idxes = [i for i, t in enumerate(tokens) if t not in self.stopwords] word_idxes = self.skip_aug(word_idxes, tokens) if len(word_idxes) == 0: if self.verbose > 0: exception = Warning(name=WarningName.OUT_OF_VOCABULARY, code=WarningCode.WARNING_CODE_002, msg=WarningMessage.NO_WORD) exception.output() return None if len(word_idxes) < aug_cnt: aug_cnt = len(word_idxes) aug_idexes = self.sample(word_idxes, aug_cnt) return aug_idexes
def _validate_augment(self, data): if data is None or len(data) == 0: return [ Warning(name=WarningName.INPUT_VALIDATION_WARNING, code=WarningCode.WARNING_CODE_001, msg=WarningMessage.LENGTH_IS_ZERO) ] return []
def _get_aug_idxes(self, tokens, aug_p, mode):
    """Choose which positions of ``tokens`` should be augmented.

    In ``Method.CHAR`` mode, inputs of at most ``self.min_char`` tokens are
    rejected and the candidates are filtered with ``self.skip_aug``. In
    ``Method.WORD`` mode, positions whose token is in ``self.stopwords``
    are dropped. Any other mode keeps every position.

    :param tokens: Sequence of tokens (words or characters, depending on
        ``mode``).
    :param aug_p: Forwarded to ``self.generate_aug_cnt`` together with the
        token count.
    :param mode: ``Method.WORD`` or ``Method.CHAR``.
    :return: Whatever ``self.sample`` returns for the surviving positions,
        or ``None`` when the input is rejected or no position survives.
    """
    # Too-short inputs are not augmented at character level.
    if mode == Method.CHAR and len(tokens) <= self.min_char:
        return None

    target_cnt = self.generate_aug_cnt(len(tokens), aug_p)

    candidates = list(range(len(tokens)))
    if mode == Method.WORD:
        candidates = [pos for pos in candidates if tokens[pos] not in self.stopwords]
    elif mode == Method.CHAR:
        candidates = self.skip_aug(candidates, tokens)

    # Nothing left to augment: optionally report it, then bail out.
    if len(candidates) == 0:
        if self.verbose > 0:
            exception = Warning(name=WarningName.OUT_OF_VOCABULARY,
                                code=WarningCode.WARNING_CODE_002, msg=WarningMessage.NO_WORD)
            exception.output()
        return None

    # Never request more positions than there are candidates.
    target_cnt = min(target_cnt, len(candidates))
    return self.sample(candidates, target_cnt)
def _get_aug_idxes(self, tokens): aug_cnt = self.generate_aug_cnt(len(tokens)) word_idxes = [i for i, t in enumerate(tokens) if t not in self.stopwords] word_idxes = self.skip_aug(word_idxes, tokens) if len(word_idxes) == 0: if self.verbose > 0: exception = Warning(name=WarningName.OUT_OF_VOCABULARY, code=WarningCode.WARNING_CODE_002, msg=WarningMessage.NO_WORD) exception.output() return None if len(word_idxes) < aug_cnt: aug_cnt = len(word_idxes) aug_probs = self.model.cal_tfidf(word_idxes, tokens) aug_idxes = [] # It is possible that no token is picked. So re-try retry_cnt = 3 possible_idxes = word_idxes.copy() for _ in range(retry_cnt): for i, p in zip(possible_idxes, aug_probs): if self.prob() < p: aug_idxes.append(i) possible_idxes.remove(i) if len(possible_idxes) == aug_cnt: break # If still cannot pick up, random pick index regrardless probability if len(aug_idxes) < aug_cnt: aug_idxes.extend(self.sample(possible_idxes, aug_cnt-len(aug_idxes))) aug_idxes = self.sample(aug_idxes, aug_cnt) return aug_idxes