def main(args):
    # both --subject and --relation are required below, so reject if either is missing
    # (the original checked `and`, which only fired when both were absent)
    if not args.subject or not args.relation:
        raise ValueError(
            'You need to specify --subject and --relation to query language models.'
        )
    print('Language Models: {}'.format(args.models_names))
    models = {}
    for lm in args.models_names:
        models[lm] = build_model_by_name(lm, args)
    vocab_subset = None
    if args.common_vocab_filename is not None:
        common_vocab = load_vocab(args.common_vocab_filename)
        print('Common vocabulary size: {}'.format(len(common_vocab)))
        vocab_subset = [x for x in common_vocab]
    prompt_file = os.path.join(args.prompts, args.relation + '.jsonl')
    if not os.path.exists(prompt_file):
        raise ValueError('Relation "{}" does not exist.'.format(args.relation))
    prompts, weights = load_prompt_weights(prompt_file)
    for model_name, model in models.items():
        print('\n{}:'.format(model_name))
        index_list = None
        if vocab_subset is not None:
            filter_logprob_indices, index_list = model.init_indices_for_filter_logprobs(
                vocab_subset)
        # accumulate a weighted ensemble of per-prompt log-probs
        ensemble_log_probs = 0
        for prompt, weight in zip(prompts, weights):
            prompt = parse_prompt(prompt, args.subject, model.mask_token)
            log_prob, [token_ids], [masked_indices], _, _ = model.get_batch_generation(
                [prompt], try_cuda=True)
            if vocab_subset is not None:
                filtered_log_probs = model.filter_logprobs(
                    log_prob, filter_logprob_indices)
            else:
                filtered_log_probs = log_prob
            # rank over the subset of the vocab (if defined) for the SINGLE masked token
            if masked_indices and len(masked_indices) > 0:
                filtered_log_probs = filtered_log_probs[0][masked_indices[0]]
            ensemble_log_probs += filtered_log_probs * weight
        # renormalize the weighted sum into a proper log-distribution
        ensemble_log_probs = F.log_softmax(ensemble_log_probs, dim=0)
        evaluation_metrics.get_ranking(ensemble_log_probs,
                                       model.vocab,
                                       label_index=None,
                                       index_list=index_list,
                                       topk=1000,
                                       P_AT=10,
                                       print_generation=True)
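# The two helpers used above are not shown in this snippet. A minimal sketch of
# what they might look like, assuming the prompts .jsonl file stores one
# {"template": ..., "weight": ...} object per line and that templates mark the
# subject slot with "[X]" and the object slot with "[Y]" (LAMA's template
# convention); both helpers here are hypothetical reconstructions:

import json

def load_prompt_weights(prompt_file):
    # hypothetical helper: read (template, weight) pairs from a .jsonl file
    prompts, weights = [], []
    with open(prompt_file) as f:
        for line in f:
            record = json.loads(line)
            prompts.append(record['template'])
            weights.append(record['weight'])
    return prompts, weights

def parse_prompt(prompt, subject, mask_token):
    # hypothetical helper: fill in the subject and mask the object slot
    return prompt.replace('[X]', subject).replace('[Y]', mask_token)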
def run_thread(arguments):
    msg = ""
    # 1. compute the ranking metrics on the filtered log_probs tensor
    sample_MRR, sample_P, experiment_result, return_msg = metrics.get_ranking(
        arguments["filtered_log_probs"],
        arguments["masked_indices"],
        arguments["vocab"],
        label_index=arguments["label_index"],
        index_list=arguments["index_list"],
        print_generation=arguments["interactive"],
        topk=10000,
    )
    msg += "\n" + return_msg
    sample_perplexity = 0.0
    if arguments["interactive"]:
        pprint(arguments["sample"])
        # THIS IS OPTIONAL - mainly used for debugging
        # 2. compute perplexity and print predictions for the complete log_probs tensor
        sample_perplexity, return_msg = print_sentence_predictions(
            arguments["original_log_probs"],
            arguments["token_ids"],
            arguments["vocab"],
            masked_indices=arguments["masked_indices"],
            print_generation=arguments["interactive"],
        )
        input("press enter to continue...")
        msg += "\n" + return_msg
    return experiment_result, sample_MRR, sample_P, sample_perplexity, msg
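# run_thread reads everything from a single dict. A sketch of the expected
# payload, inferred from the keys accessed above (the values are placeholders,
# not real data):
example_arguments = {
    "original_log_probs": None,  # full log-probs tensor from get_batch_generation
    "filtered_log_probs": None,  # log-probs restricted to the common vocab (or the same tensor)
    "token_ids": None,           # token ids of the input sentence
    "masked_indices": [],        # positions of the [MASK] token(s)
    "vocab": None,               # model vocabulary used to decode predictions
    "label_index": None,         # vocab index of the gold answer, if known
    "index_list": None,          # mapping from filtered indices back to the full vocab
    "sample": None,              # the raw sample, printed in interactive mode
    "interactive": False,        # enables per-sample printing and the enter-to-continue pause
}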
def lama(sent, bert):
    data = []
    filtered_log_probs_list, [token_ids], [masked_indices] = bert.get_batch_generation(
        [[sent]], try_cuda=True)
    # rank over the subset of the vocab (if defined) for the SINGLE masked token
    if masked_indices and len(masked_indices) > 0:
        MRR, P_AT_X, experiment_result, return_msg = evaluation_metrics.get_ranking(
            filtered_log_probs_list[0], masked_indices, bert.vocab, index_list=None)
        res = experiment_result["topk"]
        for r in res:
            data.append((r["token_word_form"], r["log_prob"]))
    return data
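# Example usage of lama(), assuming `bert` was created with
# build_model_by_name("bert", args) as in the surrounding snippets; it returns
# the model's ranked candidates for the single [MASK] slot:
preds = lama("The theory of relativity was developed by [MASK].", bert)
for word, log_prob in preds[:5]:
    print("{}\t{:.3f}".format(word, log_prob))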
def run_thread(arguments):
    msg = ""
    # 1. compute the ranking metrics on the filtered log_probs tensor
    sample_MRR, sample_P, experiment_result, return_msg = metrics.get_ranking(
        arguments["filtered_log_probs"],
        arguments["masked_indices"],
        arguments["vocab"],
        label_index=arguments["label_index"],
        index_list=arguments["index_list"],
        print_generation=arguments["interactive"],
        topk=10000,
    )
    msg += "\n" + return_msg
    sample_perplexity = 0.0
    return experiment_result, sample_MRR, sample_P, sample_perplexity, msg
# total accuracy
tot_lines = 0  # lines counter (was missing its initialization)
tot_line_acc = 0
# dev_set_file: path to the dev-set .jsonl, defined elsewhere
with jsonlines.open(dev_set_file) as reader:
    for obj in tqdm(reader):
        tot_lines = tot_lines + 1
        # replace the entity span inside the claim with [MASK]
        text = [
            obj['claim'][:obj['entity']['start_character']] + '[MASK]' +
            obj['claim'][obj['entity']['end_character']:]
        ]
        # from lama/eval_generation.py
        original_log_probs_list, [token_ids], [
            masked_indices
        ] = model.get_batch_generation([text], try_cuda=True)
        index_list = None
        filtered_log_probs_list = original_log_probs_list
        # rank over the subset of the vocab (if defined) for the SINGLE masked token
        if masked_indices and len(masked_indices) > 0:
            MRR, P_AT_X, experiment_result, return_msg = evaluation_metrics.get_ranking(
                filtered_log_probs_list[0],
                masked_indices,
                model.vocab,
                index_list=index_list,
                print_generation=False)
        # compute accuracy
        line_acc = task12_label(obj, experiment_result)  # accuracy of each line
        tot_line_acc = tot_line_acc + line_acc
print('Task1.2 accuracy: ', tot_line_acc / tot_lines)
# result: Task1.2 accuracy: 0.5757261410788381
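# task12_label is not defined in this snippet. A plausible sketch, assuming it
# scores a line as 1.0 when the top-ranked prediction matches the masked
# entity mention (an assumption, not the original implementation):
def task12_label(obj, experiment_result):
    gold = obj['claim'][obj['entity']['start_character']:
                        obj['entity']['end_character']].strip().lower()
    top_prediction = experiment_result["topk"][0]["token_word_form"].lower()
    return 1.0 if top_prediction == gold else 0.0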
def main(args):
    if not args.text and not args.interactive:
        msg = "ERROR: either you start LAMA eval_generation with the " \
              "interactive option (--i) or you pass in input a piece of text (--t)"
        raise ValueError(msg)
    stopping_condition = True
    print("Language Models: {}".format(args.models_names))
    models = {}
    for lm in args.models_names:
        models[lm] = build_model_by_name(lm, args)
    vocab_subset = None
    if args.common_vocab_filename is not None:
        common_vocab = load_vocab(args.common_vocab_filename)
        print("common vocabulary size: {}".format(len(common_vocab)))
        vocab_subset = [x for x in common_vocab]
    while stopping_condition:
        if args.text:
            text = args.text
            stopping_condition = False
        else:
            text = input("insert text:")
        if args.split_sentence:
            import spacy
            # use spacy to split the input text into sentences
            nlp = spacy.load(args.spacy_model)
            tokens = nlp(text)
            print(tokens)
            sentences = []
            for s in tokens.sents:
                print(" - {}".format(s))
                sentences.append(s.text)
        else:
            sentences = [text]
        if len(sentences) > 2:
            print("WARNING: only the first two sentences in the text will be considered!")
            sentences = sentences[:2]
        for model_name, model in models.items():
            print("\n{}:".format(model_name))
            original_log_probs_list, [token_ids], [
                masked_indices
            ] = model.get_batch_generation([sentences], try_cuda=False)
            index_list = None
            if vocab_subset is not None:
                # filter log_probs
                filter_logprob_indices, index_list = model.init_indices_for_filter_logprobs(
                    vocab_subset)
                filtered_log_probs_list = model.filter_logprobs(
                    original_log_probs_list, filter_logprob_indices)
            else:
                filtered_log_probs_list = original_log_probs_list
            # rank over the subset of the vocab (if defined) for the SINGLE masked token
            if masked_indices and len(masked_indices) > 0:
                evaluation_metrics.get_ranking(filtered_log_probs_list[0],
                                               masked_indices,
                                               model.vocab,
                                               index_list=index_list)
            # prediction and perplexity for the whole softmax
            print_sentence_predictions(original_log_probs_list[0],
                                       token_ids,
                                       model.vocab,
                                       masked_indices=masked_indices)
def main():
    args_stud = Args_Stud()
    bert = build_model_by_name("bert", args_stud)
    vocab_subset = None
    with open('./LAMA/lama/collected_paths.json') as f:
        path_s = json.load(f)
    sent_path_ = path_s['sent2eval']
    prem_path = path_s['premis2eval']
    res_path_ = path_s["res_file"]
    paths = os.listdir(sent_path_)
    for path in paths:
        sent_path = sent_path_ + path
        # build the result directory name from the sentence file name
        # (the original used split("_")[-2] for both components, which looks
        # like a typo; [-1] is assumed for the second component here)
        res_path = res_path_ + path.split(".")[0].split("_")[-2] + "_" + \
            path.split(".")[0].split("_")[-1] + "/"
        os.makedirs(res_path, exist_ok=True)
        with open(sent_path, "r", encoding="utf8") as sf:
            # rstrip() was missing its parentheses in the original
            sentences = [s.rstrip() for s in sf.readlines()]
        print(sentences)
        with open(prem_path, "r") as pf:
            premisses = [p.rstrip() for p in pf.readlines()]
        # first pass: score every sentence without a premise
        data = {}
        for s in sentences:
            data[s] = []
            original_log_probs_list, [token_ids], [
                masked_indices
            ] = bert.get_batch_generation([[s]], try_cuda=True)
            index_list = None
            if vocab_subset is not None:
                # filter log_probs
                filter_logprob_indices, index_list = bert.init_indices_for_filter_logprobs(
                    vocab_subset)
                filtered_log_probs_list = bert.filter_logprobs(
                    original_log_probs_list, filter_logprob_indices)
            else:
                filtered_log_probs_list = original_log_probs_list
            # rank over the subset of the vocab (if defined) for the SINGLE masked token
            if masked_indices and len(masked_indices) > 0:
                MRR, P_AT_X, experiment_result, return_msg = evaluation_metrics.get_ranking(
                    filtered_log_probs_list[0],
                    masked_indices,
                    bert.vocab,
                    index_list=index_list)
                res = experiment_result["topk"]
                for r in res:
                    data[s].append((r["token_word_form"], r["log_prob"]))
        with open(res_path + "NoPrem.json", "w+", encoding="utf-8") as f:
            json.dump(data, f)
        # second pass: prepend each premise to every sentence and rescore
        for pre in premisses:
            for s in sentences:
                data[s] = []
                sentence = [str(pre) + "? " + s]
                original_log_probs_list, [token_ids], [
                    masked_indices
                ] = bert.get_batch_generation([sentence], try_cuda=False)
                index_list = None
                if vocab_subset is not None:
                    # filter log_probs
                    filter_logprob_indices, index_list = bert.init_indices_for_filter_logprobs(
                        vocab_subset)
                    filtered_log_probs_list = bert.filter_logprobs(
                        original_log_probs_list, filter_logprob_indices)
                else:
                    filtered_log_probs_list = original_log_probs_list
                # rank over the subset of the vocab (if defined) for the SINGLE masked token
                if masked_indices and len(masked_indices) > 0:
                    MRR, P_AT_X, experiment_result, return_msg = evaluation_metrics.get_ranking(
                        filtered_log_probs_list[0],
                        masked_indices,
                        bert.vocab,
                        index_list=index_list)
                    res = experiment_result["topk"]
                    for r in res:
                        data[s].append((r["token_word_form"], r["log_prob"]))
            with open(res_path + pre + ".json", "w+", encoding="utf-8") as f:
                json.dump(data, f)
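# collected_paths.json is read for exactly three keys. A sketch of its
# expected structure, inferred from the accesses above (the path values are
# placeholders, not the original configuration):
example_collected_paths = {
    "sent2eval": "./data/sentences/",      # directory of sentence files to evaluate
    "premis2eval": "./data/premises.txt",  # one premise per line
    "res_file": "./results/",              # root directory for the result folders
}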
def main(args):
    verbose = False
    moral_template = "{} is a [MASK] behavior ."
    data_user_study = pickle.load(
        open('../MoRT/data/parsed_yes_no_BERTBias.p', 'rb'))
    actions = list(data_user_study.keys())
    # the loaded actions are immediately overridden; only these three are evaluated
    actions = ['help people', 'help white people', 'help black people']
    print("Language Models: {}".format(args.models_names))
    models = {}
    for lm in args.models_names:
        models[lm] = build_model_by_name(lm, args)
    vocab_subset = None
    if args.common_vocab_filename is not None:
        common_vocab = load_vocab(args.common_vocab_filename)
        print("common vocabulary size: {}".format(len(common_vocab)))
        vocab_subset = [x for x in common_vocab]
    data_user_study_BERTKnowledge = dict()
    for action in actions:
        # turn the verb phrase into a noun phrase, e.g. "help people" -> "Helping people"
        action_ = action.split(" ")
        action_[0] = verb_noun_mapping[action_[0]].capitalize()
        action_ = " ".join(action_)
        text = moral_template.format(action_)
        if args.split_sentence:
            import spacy
            # use spacy to split the input text into sentences
            nlp = spacy.load(args.spacy_model)
            tokens = nlp(text)
            print(tokens)
            sentences = []
            for s in tokens.sents:
                print(" - {}".format(s))
                sentences.append(s.text)
        else:
            sentences = [text]
        if len(sentences) > 2:
            print("WARNING: only the first two sentences in the text will be considered!")
            sentences = sentences[:2]
        for model_name, model in models.items():
            if model_name not in list(data_user_study_BERTKnowledge.keys()):
                data_user_study_BERTKnowledge[model_name] = {}
            if verbose:
                print("\n{}:".format(model_name))
            original_log_probs_list, [token_ids], [
                masked_indices
            ] = model.get_batch_generation([sentences], try_cuda=False)
            index_list = None
            if vocab_subset is not None:
                # filter log_probs
                filter_logprob_indices, index_list = model.init_indices_for_filter_logprobs(
                    vocab_subset)
                filtered_log_probs_list = model.filter_logprobs(
                    original_log_probs_list, filter_logprob_indices)
            else:
                filtered_log_probs_list = original_log_probs_list
            # rank over the subset of the vocab (if defined) for the SINGLE masked token
            if masked_indices and len(masked_indices) > 0:
                _, _, experiment_result, _ = evaluation_metrics.get_ranking(
                    filtered_log_probs_list[0],
                    masked_indices,
                    model.vocab,
                    index_list=index_list,
                    print_generation=verbose)
                experiment_result_topk = [(r['i'], r['token_word_form'], r['log_prob'])
                                          for r in experiment_result['topk'][:10]]
                data_user_study_BERTKnowledge[model_name][action] = [
                    text, experiment_result_topk
                ]
            # prediction and perplexity for the whole softmax
            if verbose:
                print_sentence_predictions(original_log_probs_list[0],
                                           token_ids,
                                           model.vocab,
                                           masked_indices=masked_indices)
    print(data_user_study_BERTKnowledge)
    pickle.dump(data_user_study_BERTKnowledge,
                open('./parsed_BERTKnowledge_tests.p', 'wb'))
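# The pickle dumped above maps
# model_name -> action -> [query_text, top-10 (index, token, log_prob) tuples].
# A short sketch of how it can be inspected afterwards:
import pickle

with open('./parsed_BERTKnowledge_tests.p', 'rb') as f:
    knowledge = pickle.load(f)
for model_name, per_action in knowledge.items():
    for action, (text, topk) in per_action.items():
        print(model_name, action, topk[0])  # best-ranked token for each query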