def do_train_test_split(args):
    logger.info("Performing train-test split for all datasets")
    task_dic = {"complexity": args.complexity, "eyetracking": args.eyetracking, "readability": args.readability}
    for task, do_task in task_dic.items():
        folder = f"{args.data_dir}/{task}/train_test"
        if not os.path.exists(folder) and do_task:
            os.makedirs(folder)
            if task == "complexity":
                train, test = train_test_split(args.pc, test_size=args.test_size, random_state=args.seed)
            elif task == "eyetracking":
                if args.eyetracking_mode == "word":
                    train, test = train_test_split_sentences(
                        args.et, test_frac=args.test_size, sentenceid_col="sentence_id"
                    )
                elif args.eyetracking_mode == "sentence":
                    train, test = train_test_split(args.et, test_size=args.test_size, random_state=args.seed)
            elif task == "readability":
                train, test = train_test_split(
                    args.ra, test_size=args.test_size, random_state=args.seed, stratify=args.ra[["reading_level"]]
                )
            save_tsv(train, f"{folder}/train.tsv")
            save_tsv(test, f"{folder}/test.tsv")
            logger.info(f"Train-test data saved in {folder}")
        else:
            if do_task:
                logger.info("Train-test data already exist in path, not overwriting them.")
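# Hypothetical usage sketch (not part of the pipeline): `do_train_test_split` expects an
# argparse-style namespace carrying the preprocessed dataframes and split settings.
# The field names below are inferred from the function body; actual defaults may differ.
#
#   from argparse import Namespace
#   args = Namespace(
#       data_dir="data", complexity=True, eyetracking=True, readability=True,
#       eyetracking_mode="sentence", test_size=0.2, seed=42,
#       pc=pc_df, et=et_df, ra=ra_df,  # dataframes produced by the preprocessing functions below
#   )
#   do_train_test_split(args)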
def get_et_metrics(sentences, model=None, save_path=None, load_path=None, id="model"):
    if load_path is not None and os.path.exists(load_path):
        logger.info(f"Loading predicted eye-tracking metrics from {load_path}")
        df = read_tsv(load_path)
    else:
        logger.info(f"Running eye-tracking inference with model {model}")
        # Remove all whitespaces before punctuation, to make sure that the format actually
        # matches the one used in the eye-tracking files on which the model was trained.
        sentences = (
            [{"text": re.sub(r"\s+([^\w\s])", r"\1", s)} for s in sentences]
            if isinstance(sentences[0], str)
            else sentences
        )
        model = MultitaskInferencer.load(model, gpu=True, level="token")
        res = model.inference_from_dicts(dicts=sentences)
        for i, sent in enumerate(res):
            for j, tok in enumerate(sent):
                res[i][j]["sentence_id"] = i
                res[i][j]["token_id"] = j
        res = [token for sentence in res for token in sentence]
        df = pd.DataFrame.from_dict(res)
        df["context"] = [c.rstrip() for c in df["context"]]
        if save_path is not None:
            logger.info(f"Saving inferred predictions to {save_path}")
            save_tsv(df, f"{save_path}/{id}_preds.tsv")
    return df
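# Usage sketch (illustrative paths, not repository defaults): predictions are cached as TSV
# on the first call via `save_path` and can be reloaded on later calls via `load_path`.
#
#   sentences = ["The horse raced past the barn fell .", "The old man the boat ."]
#   et_df = get_et_metrics(sentences, model="models/et_multitask", save_path="preds", id="gaze")
#   cached = get_et_metrics(sentences, load_path="preds/gaze_preds.tsv")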
def evaluate_kfold(args, data_silo, processor):
    silos = DataSiloForCrossVal.make(data_silo, n_splits=args.folds)
    # Run the whole training with early stopping to get a model,
    # then evaluate the model on the test set of each fold.
    dict_preds_labels = {}
    for task in args.label_columns:
        dict_preds_labels[task] = {}
        dict_preds_labels[task]["preds"], dict_preds_labels[task]["labels"] = [], []
    for num_fold, silo in enumerate(silos):
        if not args.do_eval_only:
            model = train_on_split(args, silo, processor, num_fold)
        else:
            model = CustomAdaptiveModel.load(f"{args.model_name}_{num_fold}", device=args.device)
            model.connect_heads_with_processor(silo.processor.tasks, require_labels=True)
        evaluator_test = MultitaskEvaluator(
            data_loader=silo.get_data_loader("test"),
            tasks=silo.processor.tasks,
            device=args.device,
            report=False,
        )
        result = evaluator_test.eval(model, return_preds_and_labels=True)
        evaluator_test.log_results(result, "Test", steps=len(silo.get_data_loader("test")), num_fold=num_fold)
        # Exclude total loss
        for res in result[1:]:
            dict_preds_labels[res["task_name"]]["preds"].extend(res.get("preds"))
            dict_preds_labels[res["task_name"]]["labels"].extend(res.get("labels"))
        if args.save_predictions:
            pred_tsv = pd.DataFrame()
            for res in result[1:]:
                pred_tsv[f"{res['task_name']}_preds"] = res.get("preds")
                pred_tsv[f"{res['task_name']}_labels"] = res.get("labels")
            save_tsv(pred_tsv, os.path.join(args.out_dir, f"{args.run_name}_{num_fold}.tsv"))
    args.logger.info("Final results:")
    for task_name, task in dict_preds_labels.items():
        args.logger.info(f"__{task_name}__")
        metrics = token_level_regression_metrics(task["preds"], task["labels"])
        for metric in metrics.keys():
            args.logger.info(f"{metric}: {metrics[metric]}")
def compute_corr_ranks_over_bins(args, config):
    logger.info("Correlate features with task scores over various length bins...")
    # Compute correlation lists for all the length bins
    corr_ranks_per_bin = []
    args.leave_nans = True
    for curr_binsize in range(args.start_bin, args.end_bin + 1, args.bin_step):
        corr_ranks = {}
        for data_name in config.keys():
            data = read_tsv(config[data_name]["path"])
            bin_data = data.loc[
                (data[config[data_name]["length_bin_feat"]] >= curr_binsize - args.bin_width)
                & (data[config[data_name]["length_bin_feat"]] <= curr_binsize + args.bin_width),
                :,
            ]
            logger.info(f"Bin {curr_binsize}±{args.bin_width} examples: {len(bin_data)}")
            if args.save_binned_data:
                name = config[data_name]["path"].split(".")[0] + f"_bin{curr_binsize}.tsv"
                logger.info(f"Saving {curr_binsize}±{args.bin_width} bin to {name}")
                save_tsv(bin_data, name)
            corr_ranks = {**corr_ranks, **(compute_corr_ranks(args, bin_data, data_name, config[data_name]))}
        for task_name in corr_ranks.keys():
            corr_ranks[task_name].sort(key=lambda tup: tup[1].correlation, reverse=True)
        corr_ranks_per_bin.append(corr_ranks)
    # Order first correlation lists by correlation intensity of features
    first_bin_ranks = corr_ranks_per_bin[0]
    for task in first_bin_ranks.keys():
        first_bin_ranks[task].sort(
            key=lambda tup: -1 if np.isnan(tup[1].correlation) else tup[1].correlation, reverse=True
        )
    # Order all correlation lists based on the one for the first bin
    for i in range(len(corr_ranks_per_bin)):
        for task in corr_ranks_per_bin[i].keys():
            corr_ranks_per_bin[i][task].sort(
                key=lambda x: [
                    first_bin_ranks[task].index(tup) for tup in first_bin_ranks[task] if tup[0] == x[0]
                ]
            )
    return corr_ranks_per_bin
def preprocess_readability_data(args):
    ra_dir = os.path.join(args.data_dir, RA_FOLDER)
    idxs, texts, reading_levels = [], [], []
    for filename in os.listdir(ra_dir):
        if not filename.endswith(".txt"):
            continue
        name = filename.split("-")[0]
        with open(os.path.join(ra_dir, filename), "r") as f:
            label = f.readline()
            label = label.rstrip("\n")
            sentences = f.readlines()
            sentences = [s.rstrip("\n") for s in sentences]
            sentences = [s for s in sentences if s]
            idxs += [f"{name}-{i}" for i in range(1, len(sentences) + 1)]
            texts += sentences
            reading_levels += [label for i in range(len(sentences))]
    df = pd.DataFrame({"index": idxs, "text": texts, "reading_level": [l.strip() for l in reading_levels]})
    out = os.path.join(args.out_dir, "readability_data.tsv")
    save_tsv(df, out)
    logger.info(f"Readability assessment data were preprocessed and saved as {out} with shape {df.shape}")
    return df
def preprocess_complexity_data(args):
    pc_file = os.path.join(args.data_dir, PC_DATA)
    # Needed to avoid making the "null" word in text a NaN
    pc = pd.read_csv(pc_file, na_values=["N/A"], keep_default_na=False)
    # Remove duplicates
    pc = pc[~pc.duplicated("SENTENCE")]
    pc_vals_start_idx = 2
    # Keep only annotations to compute agreement scores
    vals = pc.iloc[:, pc_vals_start_idx:]
    # Check if at least `complexity_min_agree` participants agree on the complexity score
    agreement = [
        x >= args.complexity_min_agree
        for x in compute_agreement(vals, vals.mean(axis=1), vals.std(axis=1))
    ]
    df = pd.DataFrame({"index": pc["ID"], "text": pc["SENTENCE"], "score": vals.mean(axis=1)})
    if args.do_features:
        # Load features
        pc_features_file = os.path.join(args.data_dir, PC_FEATURES)
        pc_features = pd.read_csv(pc_features_file, sep="\t")
        # Concatenate PC linguistic features
        df = pd.concat([df.reset_index(drop=True), pc_features.reset_index(drop=True)], axis=1)
    # Filter by agreement
    df = df[agreement]
    out = os.path.join(args.out_dir, "complexity_data.tsv")
    save_tsv(df, out)
    logger.info(f"Perceived complexity data were preprocessed and saved as {out} with shape {df.shape}")
    return df
def finetune_sentence_level(args):
    logging.basicConfig(
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        datefmt="%d-%m-%y %H:%M:%S",
        level=logging.INFO,
    )
    args.logger = logging.getLogger(__name__)
    if args.do_logfile:
        filehandler = logging.FileHandler(os.path.join(args.log_dir, f"{args.run_name}.log"))
        args.logger.addHandler(filehandler)
    args.logger.info(vars(args))
    # Setup MLFlow
    ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
    ml_logger.init_experiment(experiment_name=args.experiment_name, run_name=args.run_name)
    set_all_seeds(seed=args.seed)
    args.device, args.n_gpu = initialize_device_settings(use_cuda=True)
    # Create a tokenizer
    tok_class = None if not args.model_class_name else f"{args.model_class_name}Tokenizer"
    tokenizer = CustomTokenizer.load(
        pretrained_model_name_or_path=args.model_name,
        do_lower_case=args.do_lower_case,
        tokenizer_class=tok_class,
    )
    # Create a processor for the dataset
    processor = load_processor(args, tokenizer)
    # Create a DataSilo that loads several datasets (train/dev/test),
    # provides DataLoaders and calculates descriptive statistics
    data_silo = DataSilo(processor=processor, batch_size=args.batch_size)
    if args.do_feat_embeds:
        args.feat_size = processor.feat_size
    # We do cross-validation
    if args.folds > 1:
        evaluate_kfold(args, data_silo, processor)
    else:
        adapt_model = train_on_split(args, data_silo, processor)
        evaluator_test = MultitaskEvaluator(
            data_loader=data_silo.get_data_loader("test"),
            tasks=data_silo.processor.tasks,
            device=args.device,
        )
        result = evaluator_test.eval(adapt_model, return_preds_and_labels=True)
        evaluator_test.log_results(result, "Test", steps=len(data_silo.get_data_loader("test")))
        pred_tsv = pd.DataFrame()
        args.logger.info("Test results:")
        for res in result[1:]:
            args.logger.info(f"__{res['task_name']}__")
            if args.train_mode == "classification":
                metrics = classification_metrics(res.get("preds"), res.get("labels"))
                args.logger.info(metrics)
            else:
                metrics = regression_metrics(res.get("preds"), res.get("labels"))
                for metric in metrics.keys():
                    args.logger.info(f"{metric}: {metrics[metric]}")
            if args.save_predictions:
                pred_tsv[f"{res['task_name']}_preds"] = res.get("preds")[0]
                pred_tsv[f"{res['task_name']}_labels"] = res.get("labels")[0]
        if args.save_predictions:
            save_tsv(pred_tsv, os.path.join(args.out_dir, f"{args.run_name}.tsv"))
        # Load trained model and perform inference
        dicts = [
            {"text": "The intense interest aroused in the public has now somewhat subsided."},
            {"text": "The quick brown fox jumped over the lazy dog."},
        ]
        model = MultitaskInferencer.load(args.save_dir, gpu=True, level="sentence")
        result = model.inference_from_dicts(dicts=dicts)
        args.logger.info("Inference example:")
        args.logger.info(result)
def get_surprisals(args):
    set_seed(args.seed, cuda=args.cuda)
    logger.info("Importing tokenizer and pre-trained model")
    tok_class = None if not args.model_class_name else f"{args.model_class_name}Tokenizer"
    ref = args.reference_hf_model if args.reference_hf_model is not None else args.model_name_or_path
    model = AutoModelWithLMHead.from_pretrained(ref)
    # Loading a local model, we need to replace the AutoModel with the local model
    if args.reference_hf_model is not None:
        farm_lm = LanguageModel.load(args.model_name_or_path, language_model_class=args.model_class_name)
        # Set the underlying model to the custom loaded model.
        # The LM head used for surprisal is the original pretrained head.
        logger.info(f"Setting model.{model.base_model_prefix} attribute with model: {args.model_name_or_path}")
        setattr(model, model.base_model_prefix, farm_lm.model)
        tokenizer = CustomTokenizer.load(
            pretrained_model_name_or_path=args.model_name_or_path,
            do_lower_case=args.do_lower_case,
            tokenizer_class=tok_class,
        )
    else:
        tokenizer = AutoTokenizer.from_pretrained(ref)
    device = torch.device("cuda" if args.cuda else "cpu")
    model.to(device)
    model.eval()
    logger.info(f"Reading sentences from {args.inputf}")
    if args.inputf.endswith(".tsv"):  # lingcomp tsv format
        df = read_tsv(args.inputf)
        sentences = list(df["text"])
    elif args.inputf.endswith(".json"):  # syntaxgym test suite format
        sentences = get_sentences_from_json(args.inputf)
    elif args.inputf.endswith(".txt"):  # one sentence per line
        sentences = open(args.inputf, "r").read().split("\n")
    else:
        raise AttributeError("Only .tsv, .json and .txt input files are supported.")
    dict_list = []
    for i, sentence in tqdm(enumerate(sentences)):
        surprisals = get_surprisal_scores(sentence, tokenizer, model, device)
        if args.mode in ["token", "sentence"]:
            for token, token_idx, surprisal, _, _ in surprisals:
                dict_list.append(
                    {"sentence_id": i + 1, "token_id": token_idx, "token": token, "surprisal": surprisal}
                )
        elif args.mode == "word":
            words, word_surps, word_spans = aggregate_word_level(sentence, surprisals)
            for j, word in enumerate(words):
                dict_list.append(
                    {
                        "start": word_spans[j]["start"],
                        "end": word_spans[j]["end"],
                        "context": word,
                        "surprisal": word_surps[j],
                        "sentence_id": i + 1,
                        "token_id": j + 1,
                    }
                )
    out = pd.DataFrame(dict_list)
    if args.mode == "sentence":
        surprisals = list(out.groupby("sentence_id", sort=False).sum()["surprisal"])
        assert len(surprisals) == len(sentences), "Sentence-surprisal number mismatch"
        dict_list = []
        for k, sent in enumerate(sentences):
            dict_list.append({"sentence_id": k + 1, "sentence": sent, "surprisal": surprisals[k]})
        out = pd.DataFrame(dict_list)
    logger.info(f"Surprisal values at {args.mode}-level were saved to {args.outputf}")
    save_tsv(out, args.outputf)
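# Hypothetical invocation sketch: the fields below mirror the attributes accessed in
# `get_surprisals`; the model name and file paths are placeholders, not repository defaults.
#
#   from argparse import Namespace
#   args = Namespace(
#       seed=42, cuda=True, model_name_or_path="gpt2", reference_hf_model=None,
#       model_class_name=None, do_lower_case=False,
#       inputf="sentences.txt", mode="word", outputf="surprisals.tsv",
#   )
#   get_surprisals(args)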
def create_preprocessed_dataset(self):
    if self.version in ["zuco1-nr", "zuco1-sr"]:
        df = read_zuco1_mat(self.mat_files_path)
    elif self.version == "zuco2":
        df = read_zuco2_mat(self.mat_files_path)
    else:
        raise AttributeError("Selected version of ZuCo does not exist.")
    # Clean up words since we need to rely on whitespaces for aligning
    # sentences with tokens.
    logger.info("Preprocessing values for the dataset...")
    df["content"] = [str(w).replace(" ", "") for w in df["content"]]
    word_skip = [int(v) for v in list(df["FXC"].isna())]
    # If FXC is NaN, it corresponds to 0 fixations
    df["FXC"] = df["FXC"].fillna(0)
    # Create new fields for the dataset
    word_id = [
        f"{x}-{y}-{z}"
        for x, y, z in zip(df["task_id"], df["sent_idx"], df["word_idx"].astype("int32"))
    ]
    length = [len(str(x)) for x in df["content"]]
    mean_fix_dur = []
    for x, y in zip(df["TRT"], df["FXC"]):
        if pd.isna(x) or pd.isna(y):
            mean_fix_dur.append(np.nan)
        elif y == 0:
            mean_fix_dur.append(0)
        else:
            mean_fix_dur.append(x / y)
    refix_count = [max(x - 1, 0) if pd.notna(x) else np.nan for x in df["FXC"]]
    reread_prob = [x > 1 if pd.notna(x) else np.nan for x in df["FXC"]]
    # Since here we do not have the selective go-past time as for GECO,
    # we approximate it using go-past time minus gaze duration.
    # Note that this approximation is a lower bound in case of multiple regressions.
    tot_regr_from_dur = []
    for x, y in zip(df["GPT"], df["GD"]):
        if pd.isna(x) or pd.isna(y):
            tot_regr_from_dur.append(np.nan)
        else:
            tot_regr_from_dur.append(max(x - y, 0))
    # We do not have POS info for ZuCo corpora
    pos = ["UNK" for x in range(len(df))]
    fix_prob = [1 - x for x in word_skip]
    # Format taken from Hollenstein et al. 2019 "NER at First Sight"
    out = pd.DataFrame({
        # Identifiers
        "participant": df["participant"],
        "text_id": df["task_id"],  # Name of the recorded reading portion
        "sentence_id": df["sent_idx"],  # Absolute sentence position in reading portion
        # AOI-level measures
        "word_id": word_id,
        "word": df["content"],
        "length": length,
        "pos": pos,
        # Basic measures
        "fix_count": df["FXC"],
        "fix_prob": fix_prob,
        "mean_fix_dur": mean_fix_dur,
        # Early measures
        "first_fix_dur": df["FFD"],
        "first_pass_dur": df["GD"],
        # Late measures
        "tot_fix_dur": df["TRT"],
        "refix_count": refix_count,
        "reread_prob": reread_prob,
        # Context measures
        "tot_regr_from_dur": tot_regr_from_dur,
        "n-2_fix_prob": ([0, 0] + fix_prob)[:len(df)],
        "n-1_fix_prob": ([0] + fix_prob)[:len(df)],
        "n+1_fix_prob": (fix_prob + [0])[1:],
        "n+2_fix_prob": (fix_prob + [0, 0])[2:],
        "n-2_fix_dur": ([0, 0] + list(df["TRT"]))[:len(df)],
        "n-1_fix_dur": ([0] + list(df["TRT"]))[:len(df)],
        "n+1_fix_dur": (list(df["TRT"]) + [0])[1:],
        "n+2_fix_dur": (list(df["TRT"]) + [0, 0])[2:],
    })
    # Convert to correct data types
    out = out.astype(self.out_types_word)
    # Caching preprocessed dataset for next Processor calls
    save_tsv(out, self.out_preprocessed)
    logger.info(
        f"{self.version} data were preprocessed and saved as {self.out_preprocessed} with shape {out.shape}"
    )
    self.preprocessed_data = out
def create_preprocessed_dataset(self):
    df = pd.read_csv(
        self.data_path,
        usecols=DUNDEE_DATA_COLS,
        sep="\t",
        quoting=csv.QUOTE_NONE,
        engine="python",
        na_values=[""],
        keep_default_na=False,
    )
    # Clean up words since we need to rely on whitespaces for aligning
    # sentences with tokens.
    df["WORD"] = [str(w).replace(" ", "") for w in df["WORD"]]
    logger.info("Preprocessing values for the dataset...")
    keep_idx = []
    curr_sent_id, curr_wnum = 1, 0
    curr_val = df.loc[0, "SentenceID"]
    curr_pp = df.loc[0, "Participant"]
    sent_ids, word_ids = [], []
    for _, r in tqdm(df.iterrows()):
        # Tokens are split from punctuation for POS tagging, so we need to reassemble regions.
        # We use WNUM to check if the token belongs to the same region.
        if r["WNUM"] == curr_wnum:
            keep_idx.append(False)
            continue
        keep_idx.append(True)
        curr_wnum = r["WNUM"]
        # Advance sentence id
        if r["SentenceID"] != curr_val:
            curr_sent_id += 1
            curr_val = r["SentenceID"]
        # Data are ordered, so we can reset sentence indexes when switching participants
        if r["Participant"] != curr_pp:
            curr_sent_id = 1
            curr_pp = r["Participant"]
        sent_ids.append(curr_sent_id)
        word_ids.append(f'{int(r["Itemno"])}-{int(r["SentenceID"])}-{int(r["ID"])}')
    # Filter out duplicates
    df = df[keep_idx]
    out = pd.DataFrame({
        # Identifiers
        "participant": df["Participant"],
        "text_id": df["Itemno"],
        "sentence_id": sent_ids,
        # AOI-level measures
        "word_id": word_ids,
        "word": df["WORD"],
        "length": df["WLEN"],
        "pos": df["UniversalPOS"],
        # Basic measures
        "fix_count": df["nFix"],
        "fix_prob": df["Fix_prob"],
        "mean_fix_dur": df["Mean_fix_dur"],
        # Early measures
        "first_fix_dur": df["First_fix_dur"],
        "first_pass_dur": df["First_pass_dur"],
        # Late measures
        "tot_fix_dur": df["Tot_fix_dur"],
        "refix_count": df["nRefix"],
        "reread_prob": df["Re-read_prob"],
        # Context measures
        "tot_regr_from_dur": df["Tot_regres_from_dur"],
        "n-2_fix_prob": df["n-2_fix_prob"],
        "n-1_fix_prob": df["n-1_fix_prob"],
        "n+1_fix_prob": df["n+1_fix_prob"],
        "n+2_fix_prob": df["n+2_fix_prob"],
        "n-2_fix_dur": df["n-2_fix_dur"],
        "n-1_fix_dur": df["n-1_fix_dur"],
        "n+1_fix_dur": df["n+1_fix_dur"],
        "n+2_fix_dur": df["n+2_fix_dur"],
    })
    # Convert to correct data types
    out = out.astype(self.out_types_word)
    # Caching preprocessed dataset for next Processor calls
    save_tsv(out, self.out_preprocessed)
    logger.info(
        f"Dundee data were preprocessed and saved as {self.out_preprocessed} with shape {out.shape}"
    )
    self.preprocessed_data = out
def create_preprocessed_dataset(self):
    data = pd.read_excel(
        self.data_path,
        usecols=GECO_DATA_COLS,
        sheet_name="DATA",
        na_values=GECO_NA_VALUES,
        keep_default_na=False,
    )
    extra = pd.read_excel(
        self.materials_path,
        sheet_name="ALL",
        na_values=["N/A"],
        keep_default_na=False,
        usecols=GECO_MATERIAL_COLS,
    )
    sent_ids = read_tsv(self.sentence_ids_path)
    logger.info("Preprocessing values for the dataset...")
    df = pd.merge(data, extra, how="left", on="WORD_ID")
    df = pd.merge(df, sent_ids, how="left", on="WORD_ID")
    # Clean up words since we need to rely on whitespaces for aligning
    # sentences with tokens.
    df["WORD"] = [str(w).replace(" ", "") for w in df["WORD"]]
    # Create new fields for the dataset
    text_id = [f"{x}-{y}" for x, y in zip(df["PART"], df["TRIAL"])]
    length = [len(str(x)) for x in df["WORD"]]
    # Handle the case where we don't fill NaN values
    mean_fix_dur = []
    for x, y in zip(df["WORD_TOTAL_READING_TIME"], df["WORD_FIXATION_COUNT"]):
        if pd.isna(x):
            mean_fix_dur.append(np.nan)
        elif y == 0:
            mean_fix_dur.append(0)
        else:
            mean_fix_dur.append(x / y)
    refix_count = [max(x - 1, 0) for x in df["WORD_RUN_COUNT"]]
    reread_prob = [x > 1 for x in df["WORD_FIXATION_COUNT"]]
    # Handle the case where we don't fill NaN values
    tot_regr_from_dur = []
    for x, y in zip(df["WORD_GO_PAST_TIME"], df["WORD_SELECTIVE_GO_PAST_TIME"]):
        if pd.isna(x) or pd.isna(y):
            tot_regr_from_dur.append(np.nan)
        else:
            tot_regr_from_dur.append(max(x - y, 0))
    # 2050 tokens per participant do not have POS info.
    # We use a special UNK token for missing pos tags.
    pos = [GECO_POS_MAP[x] if not pd.isnull(x) else GECO_POS_MAP["UNK"] for x in df["PART_OF_SPEECH"]]
    fix_prob = [1 - x for x in df["WORD_SKIP"]]
    # Format taken from Hollenstein et al. 2019 "NER at First Sight"
    out = pd.DataFrame({
        # Identifiers
        "participant": df["PP_NR"],
        "text_id": text_id,  # PART-TRIAL for GECO
        "sentence_id": df["SENTENCE_ID"],  # Absolute sentence position for GECO
        # AOI-level measures
        "word_id": df["WORD_ID"],
        "word": df["WORD"],
        "length": length,
        "pos": pos,
        # Basic measures
        "fix_count": df["WORD_FIXATION_COUNT"],
        "fix_prob": fix_prob,
        "mean_fix_dur": mean_fix_dur,
        # Early measures
        "first_fix_dur": df["WORD_FIRST_FIXATION_DURATION"],
        "first_pass_dur": df["WORD_GAZE_DURATION"],
        # Late measures
        "tot_fix_dur": df["WORD_TOTAL_READING_TIME"],
        "refix_count": refix_count,
        "reread_prob": reread_prob,
        # Context measures
        "tot_regr_from_dur": tot_regr_from_dur,
        "n-2_fix_prob": ([0, 0] + fix_prob)[:len(df)],
        "n-1_fix_prob": ([0] + fix_prob)[:len(df)],
        "n+1_fix_prob": (fix_prob + [0])[1:],
        "n+2_fix_prob": (fix_prob + [0, 0])[2:],
        "n-2_fix_dur": ([0, 0] + list(df["WORD_TOTAL_READING_TIME"]))[:len(df)],
        "n-1_fix_dur": ([0] + list(df["WORD_TOTAL_READING_TIME"]))[:len(df)],
        "n+1_fix_dur": (list(df["WORD_TOTAL_READING_TIME"]) + [0])[1:],
        "n+2_fix_dur": (list(df["WORD_TOTAL_READING_TIME"]) + [0, 0])[2:],
    })
    # Convert to correct data types
    out = out.astype(self.out_types_word)
    # Caching preprocessed dataset for next Processor calls
    save_tsv(out, self.out_preprocessed)
    logger.info(
        f"GECO data were preprocessed and saved as {self.out_preprocessed} with shape {out.shape}"
    )
    self.preprocessed_data = out
def evaluate_model_on_suite(model_name_or_path, suite_path, save_path=None, conf_interval=0.95):
    """Given a model or its predictions, computes its performance on a test suite
    based on the formula specified in it.

    Args:
        model_name_or_path: A path to a local folder containing model files (HuggingFace or
            FARM format), a model name from the HuggingFace model hub, or a path to a local
            TSV file containing model predictions.
        suite_path: A path to a local JSON file containing a suite in SyntaxGym format.
        save_path: If model_name_or_path is a model, its inferred predictions will be saved
            to this path.
        conf_interval: Float between 0 and 1, confidence interval computed on metric values.

    Returns:
        A dataframe containing average scores across items for each condition and region,
        along with confidence bounds, and a dataframe containing success ratios over suite
        formulas for each score column and prediction formula. E.g.

        condition_name  region_number  score_name           metric_name  mean  sem  count  region  up_conf  low_conf
        ambig_comma     1              first_fix_dur_score  sum          395   14   24     Start   365      424
        ambig_comma     2              first_fix_dur_score  sum          179   5    24     Verb    167      191
        ambig_comma     4              first_fix_dur_score  sum          228   7    24     NP/Z    213      244
        ambig_comma     5              first_fix_dur_score  sum          158   7    24     Verb    143      173

        prediction_id  prediction_formula                                 score_column         result
        0              (((5;%ambig_nocomma%) > (5;%ambig_comma%)))        first_fix_dur_score  0.66
        1              (((5;%ambig_nocomma%) > (5;%unambig_nocomma%)))    first_fix_dur_score  0.33
        2              ((((5;%ambig_nocomma%) - (5;%ambig_comma%)) > ...  first_fix_dur_score  0.33
    """
    if os.path.exists(model_name_or_path) and model_name_or_path.endswith(".tsv"):
        pred_suite, df = compute_suite_et_metrics(suite_path, load_path=model_name_or_path)
    else:
        pred_suite, df = compute_suite_et_metrics(suite_path, model=model_name_or_path, save_path=save_path)
    # Average predictions across conditions, regions, scores and metric names
    grp = df.groupby(["condition_name", "region_number", "score_name", "metric_name"])
    avg_df = grp["metric_val"].agg(["mean", "sem", "count"]).reset_index()
    avg_df["region"] = [pred_suite.region_names[i - 1] for i in avg_df.region_number]
    # Compute confidence intervals
    avg_df["up_conf"], avg_df["low_conf"] = zip(
        *[confidence_intervals(r, conf_interval) for _, r in avg_df.iterrows()]
    )
    avg_df = avg_df.sort_values(["score_name", "condition_name", "region_number"])
    avg_df = avg_df[
        [
            "condition_name",
            "region_number",
            "region",
            "score_name",
            "metric_name",
            "mean",
            "sem",
            "count",
            "up_conf",
            "low_conf",
        ]
    ]
    pred_df = evaluate_suite(pred_suite)
    res_df = pred_df.groupby(["prediction_id", "prediction_formula", "score_column"]).mean()["result"]
    res_df = res_df.reset_index().sort_values(["score_column", "prediction_id"])
    res_df = res_df[["prediction_id", "result", "score_column", "prediction_formula"]]
    if save_path:
        logger.info(f"Saving dataframes to {save_path}")
        save_tsv(avg_df, f"{save_path}/{pred_suite.meta['name']}_avg.tsv")
        save_tsv(res_df, f"{save_path}/{pred_suite.meta['name']}_res.tsv")
    return avg_df, res_df
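# Usage sketch (illustrative paths, not repository defaults): the suite JSON follows the
# SyntaxGym format, and the first argument may be a model directory, a HuggingFace hub name,
# or a cached predictions TSV produced by a previous run.
#
#   avg_df, res_df = evaluate_model_on_suite(
#       "models/et_multitask", "suites/npz_ambiguity.json", save_path="results"
#   )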