def __init__(self, gold_file_path, predictions_file_path):
    """Load the gold labels and a predictions file, then score every split.

    Args:
        gold_file_path: path, relative or absolute, to the gold file.
        predictions_file_path: path, relative or absolute, to the
            predictions file (JSON).

    The constructor returns nothing; the nested score dictionary is stored
    on ``self.results`` with keys ``'intrasentence'``, ``'intersentence'``
    (each holding one entry per domain plus ``'overall'``) and a top-level
    ``'overall'`` entry covering both splits.
    """
    gold = dataloader.StereoSet(gold_file_path)
    self.intersentence_examples = gold.get_intersentence_examples()
    self.intrasentence_examples = gold.get_intrasentence_examples()

    # Lookup tables keyed by sentence ID (or by (example ID, gold label)).
    self.id2term = {}
    self.id2gold = {}
    self.id2score = {}
    self.example2sent = {}
    self.domain2example = {
        "intersentence": defaultdict(list),
        "intrasentence": defaultdict(list),
    }

    with open(predictions_file_path) as handle:
        self.predictions = json.load(handle)

    # Intrasentence examples are ingested first, then intersentence ones.
    splits = (
        ("intrasentence", self.intrasentence_examples),
        ("intersentence", self.intersentence_examples),
    )
    for split_name, split_examples in splits:
        examples_by_domain = self.domain2example[split_name]
        for example in split_examples:
            for sentence in example.sentences:
                sentence_id = sentence.ID
                self.id2term[sentence_id] = example.target
                self.id2gold[sentence_id] = sentence.gold_label
                self.example2sent[(example.ID, sentence.gold_label)] = sentence_id
                # NOTE(review): the example is appended once per sentence, so
                # each example appears several times in its domain list --
                # confirm this is intended before relying on per-domain counts.
                examples_by_domain[example.bias_type].append(example)

    # A split that is absent from the predictions file counts as empty.
    scored_sentences = (self.predictions.get('intrasentence', [])
                        + self.predictions.get('intersentence', []))
    for entry in scored_sentences:
        self.id2score[entry['id']] = entry['score']

    results = defaultdict(dict)
    for split_name in ('intrasentence', 'intersentence'):
        for domain in ('gender', 'profession', 'race', 'religion'):
            domain_examples = self.domain2example[split_name][domain]
            results[split_name][domain] = self.evaluate(domain_examples)

    results['intersentence']['overall'] = self.evaluate(self.intersentence_examples)
    results['intrasentence']['overall'] = self.evaluate(self.intrasentence_examples)
    results['overall'] = self.evaluate(
        self.intersentence_examples + self.intrasentence_examples)
    self.results = results
def __init__(self, tokenizer, args):
    """Tokenize every intersentence (context, sentence) pair up front.

    Args:
        tokenizer: tokenizer object. Its class name selects the code path:
            ``XLNetTokenizer`` and ``RobertaTokenizer`` go through
            ``self._tokenize``; every other tokenizer goes through its own
            ``encode_plus``.
        args: object exposing ``input_file``, ``max_seq_length`` and
            ``batch_size`` attributes.

    After construction ``self.preprocessed`` holds one
    ``(input_ids, token_type_ids, attention_mask, sentence_id)`` tuple per
    candidate sentence.
    """
    self.tokenizer = tokenizer
    dataset = dataloader.StereoSet(args.input_file)

    # Not updated anywhere in this method; presumably maintained by
    # self._tokenize -- TODO confirm.
    self.emp_max_seq_length = float("-inf")
    self.max_seq_length = args.max_seq_length
    self.batch_size = args.batch_size

    # The previous XLNet branch assigned a long priming passage and then
    # immediately overwrote it with None, so both branches always produced
    # None. The dead store is removed; the attribute is kept.
    self.prepend_text = None

    # Loop invariants: neither the tokenizer nor the batch size changes
    # while the examples are being processed.
    tokenizer_name = self.tokenizer.__class__.__name__
    uses_manual_tokenization = tokenizer_name in ("XLNetTokenizer", "RobertaTokenizer")
    # Padding is only requested when items will be batched together.
    pad_to_max_length = self.batch_size > 1

    intersentence_examples = dataset.get_intersentence_examples()

    self.preprocessed = []
    for example in intersentence_examples:
        context = example.context
        if self.prepend_text is not None:
            context = self.prepend_text + context

        for sentence in example.sentences:
            if uses_manual_tokenization:
                # support legacy pretrained NSP heads!
                input_ids, token_type_ids = self._tokenize(context, sentence.sentence)
                attention_mask = [1 for _ in input_ids]
            else:
                encoded_dict = self.tokenizer.encode_plus(
                    text=context,
                    text_pair=sentence.sentence,
                    truncation=True,
                    add_special_tokens=True,
                    max_length=self.max_seq_length,
                    truncation_strategy="longest_first",
                    pad_to_max_length=pad_to_max_length,
                    return_tensors="pt",
                    return_token_type_ids=True,
                    return_attention_mask=True,
                    return_overflowing_tokens=False,
                    return_special_tokens_mask=False)
                input_ids = encoded_dict['input_ids']
                token_type_ids = encoded_dict['token_type_ids']
                attention_mask = encoded_dict['attention_mask']

            self.preprocessed.append(
                (input_ids, token_type_ids, attention_mask, sentence.ID))

    print(f"Maximum sequence length found: {self.emp_max_seq_length}")
def __init__(self, pretrained_class="gpt2", no_cuda=False, batch_size=51,
             input_file="data/bias.json", intrasentence_model="GPT2LM",
             intrasentence_load_path=None, intersentence_model="ModelNSP",
             intersentence_load_path=None, tokenizer="GPT2Tokenizer",
             unconditional_start_token="<|endoftext|>",
             skip_intrasentence=False, skip_intersentence=False,
             max_seq_length=64, small=False, output_dir="predictions/",
             prune_percent=None, store_weight_location=None):
    """Load the dataset and tokenizer and record the evaluation settings.

    Args:
        pretrained_class: name passed to the tokenizer's ``from_pretrained``.
        no_cuda: when true, ``self.device`` is ``"cpu"`` instead of ``"cuda"``.
        batch_size: stored as ``self.BATCH_SIZE``.
        input_file: dataset path; made absolute before loading.
        intrasentence_model / intersentence_model: model names, stored as-is.
        intrasentence_load_path / intersentence_load_path: stored as-is.
        tokenizer: name of the tokenizer class looked up on ``transformers``.
        unconditional_start_token: stored as ``self.UNCONDITIONAL_START_TOKEN``.
        skip_intrasentence / skip_intersentence: stored as-is.
        max_seq_length: stored as ``self.max_seq_length``.
        small, output_dir: accepted for interface compatibility; not used
            by this constructor.
        prune_percent, store_weight_location: pruning settings stored as
            ``self.PRUNE_PERCENT`` / ``self.STORE_WEIGHT_LOCATION``. The body
            previously read these two names without declaring them, which
            raises ``NameError`` unless same-named module globals exist; they
            are now explicit keyword parameters defaulting to ``None``.
    """
    print(f"Loading {input_file}...")

    self.BATCH_SIZE = batch_size
    filename = os.path.abspath(input_file)
    self.dataloader = dataloader.StereoSet(filename)
    self.cuda = not no_cuda
    self.device = "cuda" if self.cuda else "cpu"

    self.SKIP_INTERSENTENCE = skip_intersentence
    self.SKIP_INTRASENTENCE = skip_intrasentence
    self.UNCONDITIONAL_START_TOKEN = unconditional_start_token

    # store pruning information
    self.PRUNE_PERCENT = prune_percent
    self.STORE_WEIGHT_LOCATION = store_weight_location

    self.PRETRAINED_CLASS = pretrained_class
    self.TOKENIZER = tokenizer
    # The tokenizer class is resolved by name on the transformers module.
    self.tokenizer = getattr(transformers, self.TOKENIZER).from_pretrained(
        self.PRETRAINED_CLASS)

    self.INTRASENTENCE_MODEL = intrasentence_model
    self.INTRASENTENCE_LOAD_PATH = intrasentence_load_path
    self.INTERSENTENCE_MODEL = intersentence_model
    self.INTERSENTENCE_LOAD_PATH = intersentence_load_path
    self.max_seq_length = max_seq_length

    # Echo the effective configuration.
    print("---------------------------------------------------------------")
    print(f"{Fore.LIGHTCYAN_EX} ARGUMENTS {Style.RESET_ALL}")
    print(f"{Fore.LIGHTCYAN_EX}Pretrained class:{Style.RESET_ALL} {pretrained_class}")
    print(f"{Fore.LIGHTCYAN_EX}Unconditional Start Token: {Style.RESET_ALL} {self.UNCONDITIONAL_START_TOKEN}")
    print(f"{Fore.LIGHTCYAN_EX}Tokenizer:{Style.RESET_ALL} {tokenizer}")
    print(f"{Fore.LIGHTCYAN_EX}Skip Intrasentence:{Style.RESET_ALL} {self.SKIP_INTRASENTENCE}")
    print(f"{Fore.LIGHTCYAN_EX}Intrasentence Model:{Style.RESET_ALL} {self.INTRASENTENCE_MODEL}")
    print(f"{Fore.LIGHTCYAN_EX}Skip Intersentence:{Style.RESET_ALL} {self.SKIP_INTERSENTENCE}")
    print(f"{Fore.LIGHTCYAN_EX}Intersentence Model:{Style.RESET_ALL} {self.INTERSENTENCE_MODEL}")
    print(f"{Fore.LIGHTCYAN_EX}CUDA:{Style.RESET_ALL} {self.cuda}")
    print("---------------------------------------------------------------")
def main(args):
    """Print, per bias domain, how many examples mention each target term.

    Reads the dataset at ``args.input_file`` (loaded with
    ``ignore_harm=True``), tallies targets over both intrasentence and
    intersentence examples, and prints each domain's targets from most to
    least frequent.
    """
    dataset = dataloader.StereoSet(args.input_file, ignore_harm=True)
    intrasentence_examples = dataset.get_intrasentence_examples()
    intersentence_examples = dataset.get_intersentence_examples()

    # domain -> Counter(target -> number of examples)
    targets_by_domain = defaultdict(Counter)
    for group in (intrasentence_examples, intersentence_examples):
        for example in group:
            targets_by_domain[example.bias_type][example.target] += 1

    for domain, target_counts in targets_by_domain.items():
        print()
        print(domain)
        # most_common() orders by descending count; ties keep insertion order.
        for target, occurrences in target_counts.most_common():
            print(f"{target}: {occurrences}")
        print()
def __init__(self, pretrained_class="bert-large-uncased-whole-word-masking",
             no_cuda=False, input_file="data/bias.json",
             intrasentence_model="BertLM",
             intersentence_model="BertNextSentence",
             tokenizer="BertTokenizer", intersentence_load_path=None,
             intrasentence_load_path=None, skip_intrasentence=False,
             skip_intersentence=False, batch_size=1, max_seq_length=128,
             output_dir="predictions/", output_file="predictions.json"):
    """Load the dataset and tokenizer and record the evaluation settings.

    The tokenizer class is looked up by name on ``transformers`` and loaded
    with right-side padding. The mask token must encode to exactly one ID,
    which is stored as ``self.MASK_TOKEN_IDX``. ``output_dir`` and
    ``output_file`` are accepted but not used by this constructor.
    """
    print(f"Loading {input_file}...")
    self.dataloader = dataloader.StereoSet(os.path.abspath(input_file))

    self.cuda = not no_cuda
    if self.cuda:
        self.device = "cuda"
    else:
        self.device = "cpu"

    self.INTRASENTENCE_LOAD_PATH = intrasentence_load_path
    self.INTERSENTENCE_LOAD_PATH = intersentence_load_path
    self.SKIP_INTERSENTENCE = skip_intersentence
    self.SKIP_INTRASENTENCE = skip_intrasentence

    self.PRETRAINED_CLASS = pretrained_class
    self.TOKENIZER = tokenizer
    tokenizer_cls = getattr(transformers, self.TOKENIZER)
    self.tokenizer = tokenizer_cls.from_pretrained(
        self.PRETRAINED_CLASS, padding_side="right")

    # to keep padding consistent with the other models -> improves LM score.
    if self.tokenizer.__class__.__name__ == "XLNetTokenizer":
        self.tokenizer.padding_side = "right"
    self.MASK_TOKEN = self.tokenizer.mask_token

    # With a batch size of 1 nothing is batched together, so no maximum
    # sequence length is imposed.
    self.batch_size = batch_size
    if self.batch_size == 1:
        self.max_seq_length = None
    else:
        self.max_seq_length = max_seq_length

    # The mask token has to map to exactly one vocabulary ID.
    mask_token_ids = self.tokenizer.encode(self.MASK_TOKEN, add_special_tokens=False)
    self.MASK_TOKEN_IDX = mask_token_ids
    assert len(self.MASK_TOKEN_IDX) == 1
    self.MASK_TOKEN_IDX = mask_token_ids[0]

    self.INTRASENTENCE_MODEL = intrasentence_model
    self.INTERSENTENCE_MODEL = intersentence_model

    # Echo the effective configuration as "<label>: <value>" rows.
    rule = "---------------------------------------------------------------"
    summary_rows = (
        ("Pretrained class", pretrained_class),
        ("Mask Token", self.MASK_TOKEN),
        ("Tokenizer", tokenizer),
        ("Skip Intrasentence", self.SKIP_INTRASENTENCE),
        ("Intrasentence Model", self.INTRASENTENCE_MODEL),
        ("Skip Intersentence", self.SKIP_INTERSENTENCE),
        ("Intersentence Model", self.INTERSENTENCE_MODEL),
        ("CUDA", self.cuda),
    )
    print(rule)
    print(f"{Fore.LIGHTCYAN_EX} ARGUMENTS {Style.RESET_ALL}")
    for label, value in summary_rows:
        print(f"{Fore.LIGHTCYAN_EX}{label}:{Style.RESET_ALL} {value}")
    print(rule)
def main(args):
    """Combine several models' prediction files into one weighted ensemble.

    Args:
        args: object exposing ``predictions_dir`` (directory holding the
            per-model ``*.json`` prediction files), ``gold_file`` (dataset
            path) and ``output_file`` (where the ensemble JSON is written).

    Every prediction file must be named ``<prefix>_<model>_...json``; the
    second ``_``-separated field of the basename is used as the model name.
    For each example, the three sentences are compared pairwise under each
    ensemble model; the sentence with the strictly higher score receives
    ``weight * its own score``, where the weight depends on the model family
    (GPT-2 vs. other) and on whether the sentence is intersentence or
    intrasentence. Ties award nothing.

    Raises:
        KeyError: if an ensemble model has no score for a sentence ID
            (including when its prediction file was never found).
        ValueError: if an example does not have exactly three sentences.
    """
    model_predictions = defaultdict(dict)
    # Collected across all files, so the inter/intra split does not depend
    # on which prediction file happens to be globbed last.
    intersentence_ids = set()

    # os.path.join copes with a missing trailing slash and an empty string.
    for model_file in glob(os.path.join(args.predictions_dir, "*.json")):
        print()
        print(f"Ingesting {model_file}...")
        # Read-only: "r+" would fail on files we are not allowed to write.
        with open(model_file, "r") as f:
            model_preds = json.load(f)

        intersentence_preds = model_preds['intersentence']
        id2score = {}
        for p in intersentence_preds + model_preds['intrasentence']:
            id2score[p['id']] = p['score']
        intersentence_ids.update(p['id'] for p in intersentence_preds)

        pretrained_class = os.path.basename(model_file).split("_")[1]
        model_predictions[pretrained_class] = id2score

    predictions = Counter()
    stereoset = dataloader.StereoSet(args.gold_file)
    examples = (stereoset.get_intrasentence_examples()
                + stereoset.get_intersentence_examples())

    BERT_INTERSENTENCE_WEIGHT = 35.0
    GPT_INTERSENTENCE_WEIGHT = 15.0
    BERT_INTRASENTENCE_WEIGHT = 1.0
    GPT_INTRASENTENCE_WEIGHT = 10000.0
    # (is_intersentence, is_gpt2) -> weight applied to the winning score.
    weights = {
        (True, True): GPT_INTERSENTENCE_WEIGHT,
        (True, False): BERT_INTERSENTENCE_WEIGHT,
        (False, True): GPT_INTRASENTENCE_WEIGHT,
        (False, False): BERT_INTRASENTENCE_WEIGHT,
    }
    ensemble_models = ['gpt2-large', "bert-large-cased", "gpt2-medium"]

    for example in examples:
        if len(example.sentences) != 3:
            raise ValueError(
                f"expected 3 sentences per example, got {len(example.sentences)}")

        for pair_a, pair_b in [(0, 1), (1, 2), (2, 0)]:
            id_a = example.sentences[pair_a].ID
            id_b = example.sentences[pair_b].ID

            for k in ensemble_models:
                v = model_predictions[k]
                prediction_a = v[id_a]
                prediction_b = v[id_b]

                # Make sure both IDs appear in the output even if they
                # never win a comparison.
                predictions.setdefault(id_a, 0)
                predictions.setdefault(id_b, 0)

                if prediction_a == prediction_b:
                    continue

                weight = weights[(id_a in intersentence_ids, 'gpt2' in k)]
                if prediction_a > prediction_b:
                    predictions[id_a] += weight * prediction_a
                else:
                    predictions[id_b] += weight * prediction_b

    final_predictions = {"intersentence": [], "intrasentence": []}
    for sentence_id, score in predictions.items():
        if sentence_id in intersentence_ids:
            split = 'intersentence'
        else:
            split = 'intrasentence'
        final_predictions[split].append({'id': sentence_id, 'score': score})

    print("Dumping results to", args.output_file)
    with open(args.output_file, "w") as f:
        json.dump(final_predictions, f, indent=2)
def main(args):
    """Print corpus statistics for the dataset at ``args.input_file``.

    Reports, for the intrasentence and intersentence splits: the average
    sentence length (in space-separated words) per bias domain, the number
    of examples per domain, the number of distinct target terms per domain,
    and the number of example triplets per domain.
    """
    dataset = dataloader.StereoSet(args.input_file)
    intrasentence_examples = dataset.get_intrasentence_examples()
    intersentence_examples = dataset.get_intersentence_examples()

    c = Counter()  # examples per bias domain, both splits combined
    terms = {
        "intersentence": defaultdict(set),
        "intrasentence": defaultdict(set),
        "overall": set()
    }
    cats = {
        "intersentence": defaultdict(int),
        "intrasentence": defaultdict(int),
        "overall": 0
    }

    # The harm tallies are not reported below; they are kept so that an
    # unexpected harm label still fails loudly with a KeyError.
    intrasentence = defaultdict(list)
    intrasentence_harm = {
        "neutral": 0, "stereotype": 0, "anti-stereotype": 0, "undecided": 0
    }
    for example in intrasentence_examples:
        terms['intrasentence'][example.bias_type].add(example.target)
        terms['overall'].add(example.target)
        c[example.bias_type] += 1
        cats['overall'] += 1
        cats['intrasentence'][example.bias_type] += 1
        for sentence in example.sentences:
            intrasentence[example.bias_type].append(sentence.sentence)
        intrasentence_harm[example.harm['gold_label']] += 1

    intersentence = defaultdict(list)
    intersentence_harm = {
        "neutral": 0, "stereotype": 0, "anti-stereotype": 0, "undecided": 0
    }
    for example in intersentence_examples:
        context = example.context
        terms['intersentence'][example.bias_type].add(example.target)
        terms['overall'].add(example.target)
        cats['intersentence'][example.bias_type] += 1
        cats['overall'] += 1
        c[example.bias_type] += 1
        for sentence in example.sentences:
            intersentence[example.bias_type].append((context, sentence.sentence))
        intersentence_harm[example.harm['gold_label']] += 1

    print("Intrasentence!")
    lengths = {"intersentence": [], "intrasentence": []}
    for k, v in intrasentence.items():
        avg_len = np.mean([len(i.split(" ")) for i in v])
        print(f"Average length of {k}: ", avg_len, "words")
        lengths['intrasentence'].append(avg_len)
    print(np.mean(lengths['intrasentence']))
    print()

    print("Intersentence!")
    for k, v in intersentence.items():
        # Length of an intersentence item is context plus sentence.
        avg_len = np.mean([len(" ".join(i).split(" ")) for i in v])
        print(f"Average length of {k}: ", avg_len, "words")
        lengths['intersentence'].append(avg_len)
    print(np.mean(lengths['intersentence']))

    print("Overall Avg Length:",
          np.mean(lengths['intersentence'] + lengths['intrasentence']))
    print()

    total = sum(c.values())
    print(f"Total Examples: {total}")
    # Fixed: len(terms) counted the keys of the bookkeeping dict (always 3)
    # instead of the distinct target terms.
    print(f"Number of total terms: {len(terms['overall'])}")
    for k, v in sorted(c.items(), key=lambda x: x[0]):
        print(f"{k}: {v}, {v / total}")

    print()
    print("------- TERMS ANALYSIS -------")
    for cat in ['intersentence', 'intrasentence']:
        total = 0
        for domain, s in terms[cat].items():
            print(f"{domain}: {len(s)}")
            total += len(s)
        print(f"{cat.capitalize()}: {total}")
        print()
    print("Overall total:", len(terms['overall']))

    print()
    print("------- TRIPLETS ANALYSIS -------")
    for cat in ['intersentence', 'intrasentence']:
        total = 0
        for domain, s in cats[cat].items():
            print(f"{domain}: {s}")
            total += s
        print(f"{cat.capitalize()}: {total}")
        print()
    print("Overall total:", cats['overall'])
    print()
def main(args):
    """Print the example clusters on which every listed model agrees.

    Args:
        args: object exposing ``gold_file`` (dataset path), ``input_dir``
            (prefix that is concatenated with ``"*.json"`` to find the
            prediction files, so a directory must end with a path
            separator), ``type`` (``"pro"``, ``"anti"`` or ``"unrelated"``)
            and ``domain_filter`` (a bias domain, or ``None`` for all).

    A cluster counts as agreed when, for every model in ``MODEL_NAMES``, the
    sentence selected by ``args.type`` scores strictly higher than both
    other sentences of the cluster. Any other ``args.type`` value matches
    nothing. ``args.domain_filter`` only limits which sentences are printed;
    the totals cover all domains.

    Raises:
        KeyError: if a model in ``MODEL_NAMES`` has no score for a sentence.
    """
    import os  # local import: only needed for basename-based name parsing

    MODEL_NAMES = [
        'bert-large-cased', 'xlnet-large-cased', 'roberta-base',
        'gpt2-medium', 'xlnet-base-cased', 'roberta-large', 'gpt2-large',
        'bert-base-cased', 'gpt2'
    ]

    gold_file = dataloader.StereoSet(args.gold_file)
    intrasentence_examples = gold_file.get_intrasentence_examples()
    intersentence_examples = gold_file.get_intersentence_examples()
    examples = intrasentence_examples + intersentence_examples

    # One dict per cluster: gold label -> sentence, plus 'type' and 'target'.
    sentence_ids = []
    target_counts = Counter()
    for example in examples:
        d = {sentence.gold_label: sentence for sentence in example.sentences}
        d['type'] = example.bias_type
        d['target'] = example.target
        target_counts[example.target] += 1
        sentence_ids.append(d)

    # sentence ID -> {model name -> score}
    sent2score = defaultdict(dict)
    for predictions_file in glob(args.input_dir + "*.json"):
        # Parse the model name from the file name only, so underscores in
        # the directory part of the path cannot shift the field index.
        model_name = os.path.basename(predictions_file).split("_")[1]
        with open(predictions_file, "r") as f:
            results = json.load(f)
        for split in ('intrasentence', 'intersentence'):
            for result in results[split]:
                sent2score[result['id']][model_name] = result['score']

    # args.type -> (label that must win, labels it must beat, in order)
    comparisons = {
        "pro": ("stereotype", ("anti-stereotype", "unrelated")),
        "anti": ("anti-stereotype", ("stereotype", "unrelated")),
        "unrelated": ("unrelated", ("stereotype", "anti-stereotype")),
    }
    comparison = comparisons.get(args.type)

    count = 0.0
    domains = Counter()
    terms_per_domain = defaultdict(Counter)
    for sentence_pair in sentence_ids:
        if comparison is None:
            continue
        winner, losers = comparison

        # Evaluate every model (no short-circuit across models) so that a
        # model with missing scores is reported instead of silently skipped.
        votes = []
        for model in MODEL_NAMES:
            winner_score = sent2score[sentence_pair[winner].ID][model]
            votes.append(all(
                winner_score > sent2score[sentence_pair[loser].ID][model]
                for loser in losers))
        if not all(votes):
            continue

        show = (args.domain_filter is None
                or args.domain_filter == sentence_pair['type'])
        for k, v in sentence_pair.items():
            if k in ("type", "target"):
                continue
            if show:
                print(f"{k}: {v.sentence}, {v.ID}")
        print()

        count += 1.0
        domains[sentence_pair['type']] += 1
        terms_per_domain[sentence_pair['type']][sentence_pair['target']] += 1

    print(f"Number of clusters that models agree on: {count}")
    print("Breakdown by Domain:", domains)
    for domain in domains:
        print(f"Domain: {domain}")
        # Share of each target's clusters on which the models agree.
        normalized_terms = {
            k: v / target_counts[k]
            for k, v in terms_per_domain[domain].items()
        }
        normalized_terms = dict(
            sorted(normalized_terms.items(),
                   key=lambda item: item[1],
                   reverse=True))
        print(normalized_terms)
        print()