def dump_line(self, outputs: JsonDict) -> str:  # pylint: disable=no-self-use
    """
    If you don't want your outputs in JSON-lines format
    you can override this function to output them differently.
    """
    keys = ['citedPaperId', 'citingPaperId', 'excerptCitationIntents']
    for k in outputs.copy():
        if k not in keys:
            outputs.pop(k)
    return json.dumps(outputs, cls=JsonFloatEncoder) + "\n"
def dump_line(self, outputs: JsonDict) -> str:  # pylint: disable=no-self-use
    """
    If you don't want your outputs in JSON-lines format
    you can override this function to output them differently.
    """
    keys = ['citation_id', 'prediction', 'probabilities', 'citation_text']
    for k in outputs.copy():
        if k not in keys:
            outputs.pop(k)
    return json.dumps(outputs, cls=JsonFloatEncoder) + "\n"
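# The two dump_line variants above differ only in their key whitelist. A
# minimal standalone sketch of the filtering behaviour (plain json, no
# JsonFloatEncoder; the function name is illustrative, not the repo's API):
import json

def _dump_line_sketch(outputs: dict) -> str:
    keys = ['citation_id', 'prediction', 'probabilities', 'citation_text']
    kept = {k: v for k, v in outputs.items() if k in keys}
    return json.dumps(kept) + "\n"

# Extra fields are silently dropped before serialisation, e.g.
#   _dump_line_sketch({'citation_id': '42', 'prediction': 'background',
#                      'attention': [0.1, 0.9]})
# yields '{"citation_id": "42", "prediction": "background"}\n'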
def attack_from_json(self, inputs: JsonDict = None) -> JsonDict:
    _volatile_json_ = inputs.copy()
    raw_instance = self.predictor.json_to_labeled_instances(inputs)[0]
    raw_tokens = list(map(lambda x: x.text, self.spacy.tokenize(inputs[self.f2c])))

    # Select words that can be changed
    sids_to_change = []
    nbr_dct = defaultdict(lambda: [])
    for i in range(len(raw_tokens)):
        if raw_tokens[i] not in self.ignore_tokens:
            word = raw_tokens[i]
            nbrs = self.searcher.search(word)
            nbrs = [nbr for nbr in nbrs if nbr not in self.forbidden_tokens]
            if len(nbrs) > 0:
                sids_to_change.append(i)
                nbr_dct[i] = nbrs

    # Max number of tokens that can be changed
    max_change_num = min(self.max_change_num(len(raw_tokens)), len(sids_to_change))

    # Construct adversarial instances
    adv_jsons = []
    for i in range(self.search_num):
        adv_tokens = [ele for ele in raw_tokens]
        # Note: random.choices samples WITH replacement, so fewer than
        # max_change_num distinct positions may end up modified.
        word_sids = random.choices(sids_to_change, k=max_change_num)
        for word_sid in word_sids:
            adv_tokens[word_sid] = random.choice(nbr_dct[word_sid])
        _volatile_json_[self.f2c] = " ".join(adv_tokens)
        adv_jsons.append(_volatile_json_.copy())

    # Check attack status, with early stopping on the first flipped label
    successful = False
    results = self.predictor.predict_batch_json(adv_jsons)
    for i, result in enumerate(results):
        adv_instance = self.predictor._json_to_instance(adv_jsons[i])
        adv_instance = self.predictor.predictions_to_labeled_instances(
            adv_instance, result)[0]
        if adv_instance[self.f2a].label != raw_instance[self.f2a].label:
            successful = True
            break
    # i / result keep their values from the last (possibly successful) iteration
    adv_tokens = adv_jsons[i][self.f2c].split(" ")
    outputs = result

    return sanitize({
        "adv": adv_tokens,
        "raw": raw_tokens,
        "outputs": outputs,
        "success": 1 if successful else 0
    })
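# The substitution step above uses random.choices, which samples positions
# WITH replacement, so a draw of k positions can touch fewer than k distinct
# words. A self-contained toy run (toy synonym table; nothing below comes
# from the repo):
import random

raw = ["a", "truly", "fine", "movie"]
nbrs = {1: ["really", "genuinely"], 2: ["good", "decent"]}
sids = list(nbrs)

adv = list(raw)
for sid in random.choices(sids, k=2):  # may draw the same sid twice
    adv[sid] = random.choice(nbrs[sid])
print(" ".join(adv))  # e.g. "a really good movie" or "a truly decent movie"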
def attack_from_json(
        self,
        inputs: JsonDict = None,
) -> JsonDict:
    # We reuse _volatile_json_ to avoid a deepcopy of the dict each time a
    # new instance is created, which is rather time-consuming.
    # !!! MUST BE CAREFUL since _volatile_json_ changes throughout the code.
    _volatile_json_ = inputs.copy()
    raw_instance = self.predictor.json_to_labeled_instances(inputs)[0]
    # raw_tokens = list(map(lambda x: x.text, self.spacy.tokenize(inputs[self.f2c])))
    raw_tokens = inputs[self.f2c].split(" ")

    # Select words that can be changed
    sids_to_change = []
    nbr_dct = defaultdict(lambda: [])
    for i in range(len(raw_tokens)):
        if raw_tokens[i] not in self.ignore_tokens:
            word = raw_tokens[i]
            nbrs = self.searcher.search(word)
            nbrs = [nbr for nbr in nbrs if nbr not in self.forbidden_tokens]
            if len(nbrs) > 0:
                sids_to_change.append(i)
                nbr_dct[i] = nbrs

    # 1. Replace each word with <UNK> and the other candidate words.
    # 2. Generate all sentences, then concatenate them into one
    #    list for batch forwarding.
    _jsons = []    # all jsons, concatenated
    _offsets = {}  # {sid: (start_offset, number_of_jsons)}
    for sid in sids_to_change:
        tmp_jsons = []
        # the first element is the raw sentence
        _volatile_json_[self.f2c] = " ".join(raw_tokens)
        tmp_jsons.append(_volatile_json_.copy())
        # the second element is the UNK sentence
        tmp_tokens = copy.copy(raw_tokens)
        tmp_tokens[sid] = '[UNK]' if self.use_bert else DEFAULT_OOV_TOKEN
        _volatile_json_[self.f2c] = " ".join(tmp_tokens)
        tmp_jsons.append(_volatile_json_.copy())
        # from the third element on are the modified sentences
        for nbr in nbr_dct[sid]:
            tmp_tokens = copy.copy(raw_tokens)
            tmp_tokens[sid] = nbr
            _volatile_json_[self.f2c] = " ".join(tmp_tokens)
            tmp_jsons.append(_volatile_json_.copy())
        _offsets[sid] = (len(_jsons), len(tmp_jsons))
        _jsons.extend(tmp_jsons)

    # No word can be substituted: return the raw sentence unchanged
    if len(_jsons) == 0:
        return sanitize({
            "adv": raw_tokens,
            "raw": raw_tokens,
            "outputs": self.predictor.predict_json(inputs),
            "success": 0
        })

    _results = self.predictor.predict_batch_json(_jsons, fast=True)

    # Compute the word saliency
    repl_dct = {}  # {sid: the replacement word for that position}
    pwws_dct = {}
    for sid in sids_to_change:
        _start, _num = _offsets[sid]
        results = _results[_start:_start + _num]
        probs = np.array([result['probs'] for result in results])
        true_probs = probs[:, np.argmax(probs[0])]
        raw_prob = true_probs[0]
        oov_prob = true_probs[1]
        other_probs = true_probs[2:]
        repl_dct[sid] = nbr_dct[sid][np.argmin(other_probs)]
        pwws_dct[sid] = np.max(raw_prob - other_probs) * np.exp(raw_prob - oov_prob)

    # Max number of tokens that can be changed
    max_change_num = min(self.max_change_num(len(raw_tokens)), len(sids_to_change))

    # Greedily substitute words in descending order of their PWWS score
    final_tokens = [ele for ele in raw_tokens]
    sorted_pwws = sorted(pwws_dct.items(), key=lambda x: x[1], reverse=True)
    successful = False
    result = None
    for i in range(max_change_num):
        sid = sorted_pwws[i][0]
        final_tokens[sid] = repl_dct[sid]
        _volatile_json_[self.f2c] = " ".join(final_tokens)
        final_instance = self.predictor._json_to_instance(_volatile_json_)
        result = self.predictor.predict_instance(final_instance)
        final_instance = self.predictor.predictions_to_labeled_instances(
            final_instance, result)[0]
        if final_instance[self.f2a].label != raw_instance[self.f2a].label:
            successful = True
            break

    return sanitize({
        "adv": final_tokens,
        "raw": raw_tokens,
        "outputs": result,
        "success": 1 if successful else 0
    })
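# The PWWS score computed above, written out in a self-contained form. For a
# position i with true-class probabilities p_raw (original word), p_unk (word
# masked out) and p_cand[j] (each candidate substitute), the code scores
#   score(i) = max_j (p_raw - p_cand[j]) * exp(p_raw - p_unk),
# i.e. the best achievable probability drop at i, weighted by the word's own
# saliency. The exp term is the unnormalised form of the softmax weighting in
# the PWWS paper; since the softmax denominator is shared across positions,
# the greedy ordering is unaffected. A toy check:
import numpy as np

def pwws_score(p_raw: float, p_unk: float, p_cand: np.ndarray) -> float:
    return float(np.max(p_raw - p_cand) * np.exp(p_raw - p_unk))

# pwws_score(0.9, 0.6, np.array([0.8, 0.3, 0.7]))  # 0.6 * e**0.3 ≈ 0.810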
def attack_from_json(self, inputs: JsonDict = None) -> JsonDict:
    _volatile_json_ = inputs.copy()
    raw_tokens = inputs[self.f2c].split(" ")

    if self.lm_constraints:
        lm_filters = self.lm_constraints[allenutil.as_sentence(raw_tokens)]

    # Pre-compute some variables for later operations
    self.ram_pool.clear()
    legal_sids = []
    nbr_dct = {}
    for i in range(len(raw_tokens)):
        if raw_tokens[i] not in self.ignore_tokens:
            lucky_dog = raw_tokens[i]  # use the original word
            cands = self.searcher.search(lucky_dog)
            cands = [ele for ele in cands if ele not in self.forbidden_tokens]
            if self.lm_constraints:
                # keep only candidates the language model allows at position i
                ram_append("before_lm", len(cands))
                cands = [ele for ele in cands if ele in lm_filters[str(i)]]
                ram_append("after_lm", len(cands))
            if len(cands) > 0:
                legal_sids.append(i)
                nbr_dct[i] = cands
    if self.lm_constraints:
        # percentage of candidates surviving the LM filter
        print("LM constraints:",
              round(100 * sum(ram_read("after_lm")) / sum(ram_read("before_lm")), 2))

    self.ram_pool['legal_sids'] = legal_sids
    self.ram_pool['nbr_dct'] = nbr_dct
    self.ram_pool['volatile_json'] = _volatile_json_
    self.ram_pool['raw_tokens'] = raw_tokens

    adv_tokens = raw_tokens.copy()
    gid = -1
    success = False
    if len(legal_sids) != 0:
        # initialize the population
        P = [
            {"individual": raw_tokens, "fitness": 1e-6, "success": 0}
            for _ in range(self.num_population)
        ]
        for gid in range(self.num_generation):
            fitnesses = [ele['fitness'] for ele in P]
            best = P[np.argmax(fitnesses)]
            print("generation ", gid, ":",
                  "topk: ", flt2str(sorted(fitnesses, reverse=True)[:3], ":4.3f", cat=", "),
                  "mean: ", round(np.mean(fitnesses), 3),
                  "median: ", round(np.median(fitnesses), 3))
            if best['success']:
                return sanitize({
                    "adv": best['individual'],
                    "raw": raw_tokens,
                    "outputs": best['result'],
                    "success": 1,
                    "generation": gid + 1
                })
            else:
                P = self.evolve(P)

    if len(legal_sids) != 0:
        adv_tokens = best['individual']
    else:
        adv_tokens = raw_tokens

    # No early exit: verify whether the best individual flips the label
    _volatile_json_[self.f2c] = " ".join(adv_tokens)
    result = self.predictor.predict_json(_volatile_json_)
    raw_instance = self.predictor._json_to_labeled_instance(inputs)
    final_instance = self.predictor._json_to_instance(_volatile_json_)
    final_instance = self.predictor.predictions_to_labeled_instances(
        final_instance, result)[0]
    success = raw_instance[self.f2a].label != final_instance[self.f2a].label

    return sanitize({
        "adv": adv_tokens,
        "raw": raw_tokens,
        "outputs": result,
        "success": success,
        "generation": gid + 1
    })
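# self.evolve is not shown in this listing. A hedged sketch of what a typical
# genetic-attack step does (fitness-proportional parent selection, token-wise
# crossover, single-word mutation); this is NOT the repo's actual code, and
# recomputing each child's fitness/success via the model is left to the caller:
import random
import numpy as np

def evolve_sketch(P, legal_sids, nbr_dct, mutate_prob=0.3):
    fitnesses = np.array([ind["fitness"] for ind in P], dtype=float)
    probs = fitnesses / fitnesses.sum()
    children = []
    for _ in range(len(P)):
        a, b = np.random.choice(len(P), size=2, p=probs)
        # crossover: take each token from either parent
        tokens = [random.choice(pair)
                  for pair in zip(P[a]["individual"], P[b]["individual"])]
        # mutation: replace one substitutable word with a neighbour
        if legal_sids and random.random() < mutate_prob:
            sid = random.choice(legal_sids)
            tokens[sid] = random.choice(nbr_dct[sid])
        children.append({"individual": tokens, "fitness": 1e-6, "success": 0})
    return children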
def modified_copy(json_dict: JsonDict, key, value):
    # Return a shallow copy of json_dict with `key` replaced by the
    # sentence form of the given token list; the original stays untouched.
    ret = json_dict.copy()
    ret[key] = as_sentence(value)
    return ret
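# Usage sketch for modified_copy, assuming as_sentence joins a token list
# with spaces (values here are illustrative):
#   base = {"sent": "an old sentence", "label": "pos"}
#   modified_copy(base, "sent", ["a", "new", "sentence"])
#   -> {"sent": "a new sentence", "label": "pos"}   # base itself is unchanged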