def skip_aug(self, token_idxes, tokens):
    """Filter ``token_idxes`` down to indexes eligible for augmentation.

    :param token_idxes: Candidate token positions.
    :param tokens: (word, constituent_pos) tuples for the whole sentence.
    :return: List of positions that survive the POS / candidate filters.
    """
    keep = []
    for idx in token_idxes:
        word, con_pos = tokens[idx][0], tokens[idx][1]
        # Determiners do not come with useful synonyms/antonyms, so they
        # are excluded from the lucky draw.
        if con_pos in ['DT']:
            continue
        if self.aug_src == 'ppdb':
            pos_list = PartOfSpeech.constituent2pos(con_pos)
            # Unknown constituent tag: the model cannot be queried for it.
            if not pos_list:
                continue
            # Some words do not exist for a specific POS; keep the index
            # only if at least one POS yields a candidate.
            if not any(len(self.model.predict(word, pos=p)) > 0 for p in pos_list):
                continue
        keep.append(idx)
    return keep
def substitute(self, data):
    """Replace selected tokens in ``data`` with sampled synonym candidates.

    :param data: Raw input text.
    :return: Augmented text, or ``data`` unchanged when no token is
        selected for augmentation.
    """
    tokens = self.tokenizer(data)
    pos = self.model.pos_tag(tokens)

    aug_idxes = self._get_aug_idxes(pos)
    if aug_idxes is None:
        return data

    results = []
    for i, token in enumerate(tokens):
        # Tokens not chosen for augmentation pass through unchanged.
        if i not in aug_idxes:
            results.append(token)
            continue

        word_poses = PartOfSpeech.constituent2pos(pos[i][1])
        if not word_poses:
            # Mapping is not defined for this tag: query the model with no
            # POS filter so every possible word is considered.
            candidates = list(self.model.predict(pos[i][0]))
        else:
            candidates = []
            for word_pos in word_poses:
                candidates.extend(self.model.predict(pos[i][0], pos=word_pos))

        # Never replace a word with itself (case-insensitive).
        candidates = [c for c in candidates if c.lower() != token.lower()]
        if not candidates:
            results.append(token)
            continue

        pick = self.sample(candidates, 1)[0]
        pick = pick.replace("_", " ").replace("-", " ").lower()
        results.append(self.align_capitalization(token, pick))

    return self.reverse_tokenizer(results)
def read(self, model_path):
    """Load a PPDB paraphrase dump into ``self.dict``.

    Each line is ``|||``-separated: constituent tag(s), phrase,
    paraphrase, then optional feature/score fields.  Entries are indexed
    as ``self.dict[phrase][pos] -> list of record dicts``.

    :param model_path: Path to the PPDB file (opened in binary mode and
        decoded as UTF-8 per line).
    """
    with open(model_path, 'rb') as f:
        for line in f:
            line = line.decode('utf-8')
            # NOTE(review): appears to skip mis-encoded lines, but the
            # '\\ x' pattern (with an embedded space) looks odd -- was
            # '\\x' intended?  Confirm against the actual PPDB dump.
            if '\\ x' in line or 'xc3' in line:
                continue
            fields = line.split('|||')
            # fields[0] looks like '[NN/VB]': strip brackets, split tags.
            constituents = fields[0].strip()[1:-1].split('/')
            phrase = fields[1].strip()
            paraphrase = fields[2].strip()
            # Keep only pairs with matching word counts (filter
            # multi-word expansions/contractions).
            if len(phrase.split()) != len(paraphrase.split()):
                continue
            scores = []
            if len(fields) == 6:
                # filter equivalence word ( for PPDB v2.0 only.)
                # entailment = fields[5].strip()
                # if entailment == 'Equivalence' and self.is_synonym:
                #     continue
                features = fields[3].strip().split()
                # Keep only features whose name mentions one of the
                # configured score-threshold keys.
                features = [
                    feature for feature in features for s in self.score_threshold
                    if s in feature
                ]
                # filter by scheme: record (scheme, score) pairs that
                # exceed the configured threshold.  Note the score is
                # stored as a string, not a float.
                for feature in features:
                    scheme, score = feature.split('=')
                    if scheme in self.score_threshold and float(
                            score) > self.score_threshold[scheme]:
                        scores.append((scheme, score))
                # # filter by feature/ score
                # if len(scores) == 0:
                #     continue
            if phrase not in self.dict:
                self.dict[phrase] = {}
            # Expand constituent tags to the part-of-speech codes used at
            # lookup time.
            part_of_speeches = [
                pos for con in constituents for pos in PartOfSpeech.constituent2pos(con)
            ]
            for pos in part_of_speeches:
                if pos not in self.dict[phrase]:
                    self.dict[phrase][pos] = []
                self.dict[phrase][pos].append({
                    'phrase': phrase,
                    'part_of_speech': pos,
                    'synonym': paraphrase,
                    'scores': scores
                })
def substitute(self, data):
    """Substitute selected tokens and record the changes in a ``Doc``.

    :param data: Raw input text.
    :return: Augmented text; when ``self.include_detail`` is set, a
        ``(text, change_logs)`` tuple instead.
    """
    # Nothing to augment for empty or whitespace-only input.
    if not data or not data.strip():
        return data

    doc = Doc(data, self.tokenizer(data))
    pos = self.model.pos_tag(doc.get_original_tokens())

    aug_idxes = self._get_aug_idxes(pos)
    if aug_idxes is None or len(aug_idxes) == 0:
        return (data, []) if self.include_detail else data

    change_seq = 0
    for idx, orig_token in enumerate(doc.get_original_tokens()):
        # Skip tokens not chosen for augmentation.
        if idx not in aug_idxes:
            continue

        word_poses = PartOfSpeech.constituent2pos(pos[idx][1])
        if not word_poses:
            # Mapping is not defined for this tag: query the model with
            # no POS filter so every possible word is considered.
            candidates = list(self.model.predict(pos[idx][0]))
        else:
            candidates = []
            for word_pos in word_poses:
                candidates.extend(self.model.predict(pos[idx][0], pos=word_pos))

        # Never replace a word with itself (case-insensitive).
        candidates = [c for c in candidates if c.lower() != orig_token.lower()]
        if not candidates:
            continue

        new_token = self.sample(candidates, 1)[0]
        new_token = new_token.replace("_", " ").replace("-", " ").lower()
        new_token = self.align_capitalization(orig_token, new_token)
        if idx == 0:
            # Sentence-initial token: capitalization alignment is applied
            # a second time, as in the original implementation.
            new_token = self.align_capitalization(orig_token, new_token)

        change_seq += 1
        doc.add_change_log(idx, new_token=new_token,
                           action=Action.SUBSTITUTE,
                           change_seq=self.parent_change_seq + change_seq)

    result = self.reverse_tokenizer(doc.get_augmented_tokens())
    if self.include_detail:
        return result, doc.get_change_logs()
    return result
def substitute(self, text):
    """Replace selected tokens with WordNet synonyms or antonyms.

    Uses ``self.synonyms`` to choose between synonym and antonym lemmas,
    querying WordNet in the language given by ``self.lang``.

    :param text: Raw input text.
    :return: Augmented text, or ``text`` unchanged when no token is
        selected for augmentation.
    """
    tokens = self.tokenizer(text)
    pos = nltk.pos_tag(tokens)

    aug_idxes = self._get_aug_idxes(pos)
    if aug_idxes is None:
        return text

    results = []
    for i, token in enumerate(tokens):
        # Tokens not chosen for augmentation pass through unchanged.
        if i not in aug_idxes:
            results.append(token)
            continue

        wn_poses = PartOfSpeech.pos2wn(pos[i][1])
        if not wn_poses:
            # Tag has no WordNet mapping: search across every POS.
            synsets = list(self.model.synsets(pos[i][0], lang=self.lang))
        else:
            synsets = []
            for wn_pos in wn_poses:
                synsets.extend(
                    self.model.synsets(pos[i][0], pos=wn_pos, lang=self.lang))

        replacements = []
        for synset in synsets:
            for lemma in synset.lemmas():
                if self.synonyms:
                    name = lemma.name()
                elif lemma.antonyms():
                    # Antonym mode: take the first antonym of the lemma.
                    name = lemma.antonyms()[0].name()
                else:
                    continue
                # Never replace a word with itself (case-insensitive).
                if name.lower() != token.lower():
                    replacements.append(name)

        if not replacements:
            results.append(token)
            continue

        pick = self.sample(replacements, 1)[0]
        pick = pick.replace("_", " ").replace("-", " ").lower()
        results.append(self.align_capitalization(token, pick))

    return self.reverse_tokenizer(results)
def get_candidates(self, tokens, token_idx):
    """Collect substitution candidates for the token at ``token_idx``.

    :param tokens: (word, constituent_pos) tuples for the whole sentence.
    :param token_idx: Position of the token to find candidates for.
    :return: Candidate words, excluding the original word itself.
    """
    word, con_pos = tokens[token_idx][0], tokens[token_idx][1]
    word_poses = PartOfSpeech.constituent2pos(con_pos)

    if not word_poses:
        # Mapping is not defined for this tag: query the model with no
        # POS filter so every possible word is considered.
        candidates = list(self.model.predict(word))
    else:
        candidates = []
        for word_pos in word_poses:
            candidates.extend(self.model.predict(word, pos=word_pos))

    # Never offer the original word back as a candidate.
    return [c for c in candidates if c.lower() != word.lower()]
def substitute(self, text):
    """Replace a sampled subset of tokens with WordNet lemma names.

    :param text: Raw input text.
    :return: Augmented text.
    """
    tokens = self.tokenizer(text)
    pos = nltk.pos_tag(tokens)

    aug_cnt = self.generate_aug_cnt(len(tokens))
    eligible_idxes = self.skip_aug(list(range(len(tokens))), pos)
    aug_idxes = self.sample(eligible_idxes, aug_cnt)

    results = []
    for i, token in enumerate(tokens):
        # Tokens not chosen for augmentation pass through unchanged.
        if i not in aug_idxes:
            results.append(token)
            continue

        wn_poses = PartOfSpeech.pos2wn(pos[i][1])
        if not wn_poses:
            # Tag has no WordNet mapping: search across every POS.
            synsets = list(self.model.synsets(pos[i][0]))
        else:
            synsets = []
            for wn_pos in wn_poses:
                synsets.extend(self.model.synsets(pos[i][0], pos=wn_pos))

        # Never replace a word with itself (case-insensitive).
        replacements = [
            name
            for synset in synsets
            for name in synset.lemma_names()
            if name.lower() != token.lower()
        ]

        if not replacements:
            results.append(token)
        else:
            pick = self.sample(replacements, 1)[0]
            results.append(self.align_capitalization(token, pick))

    return self.reverse_tokenizer(results)
def _get_aug_idxes(self, tokens): aug_cnt = self.generate_aug_cnt(len(tokens)) word_idxes = self.pre_skip_aug(tokens, tuple_idx=0) word_idxes = self.skip_aug(word_idxes, tokens) if len(word_idxes) == 0: if self.verbose > 0: exception = WarningException( name=WarningName.OUT_OF_VOCABULARY, code=WarningCode.WARNING_CODE_002, msg=WarningMessage.NO_WORD) exception.output() return None aug_idexes = [] for aug_idx in word_idxes: word_poses = PartOfSpeech.constituent2pos(tokens[aug_idx][1]) candidates = [] if word_poses is None or len(word_poses) == 0: # Use every possible words as the mapping does not defined correctly candidates.extend(self.model.predict(tokens[aug_idx][0])) else: for word_pos in word_poses: candidates.extend( self.model.predict(tokens[aug_idx][0], pos=word_pos)) candidates = [ c for c in candidates if c.lower() != tokens[aug_idx][0].lower() ] if len(candidates) > 0: candidate = self.sample(candidates, 1)[0] aug_idexes.append((aug_idx, candidate)) if len(aug_idexes) < aug_cnt: aug_cnt = len(aug_idexes) aug_idexes = self.sample(aug_idexes, aug_cnt) return aug_idexes