def __init__(
    self,
    data_dir,
    data_filename,
    out_filename,
    ref_participant,
    text_columns=TEXT_COLUMNS,
    out_types_word=OUT_TYPES_WORD,
    out_types_sentence=OUT_TYPES_SENTENCE,
    nan_cols=FILLNA_COLUMNS,
    fillna="zero",
    **kwargs,
):
    """
    data_dir: Directory where the eye-tracking dataset and materials are contained.
    data_filename: Name of the main file in data_dir containing eye-tracking measurements.
    out_filename: File where the preprocessed output will be saved.
    ref_participant: Name of the reference participant having annotated all examples,
        used for grouping and averaging scores.
    text_columns: Names of columns to be treated as text during aggregation.
    out_types_word: Dictionary of data types of word-level preprocessed data,
        with entries structured as column name : data type.
    out_types_sentence: Dictionary of data types of sentence-level preprocessed data,
        with entries structured as column name : data type.
    nan_cols: List of column names for columns that may include NaN values.
    fillna: Fill-NaN strategy enacted during aggregation in get_word_data and
        get_sentence_data. Default: zero. Choose one among:
        - none: leaves NaNs as-is.
        - zero: fills NaNs with 0 => a missing duration counts as 0 during averaging.
        - (min|mean|max)_participant: fills NaNs with the min|mean|max value
            for that token across participants.
        To be added in the future:
        - (min|mean|max)_type: fills NaNs with the min|mean|max value for that
            token in the whole dataset.
    """
    self.data_dir = data_dir
    self.data_path = os.path.join(data_dir, data_filename)
    self.out_preprocessed = os.path.join(data_dir, out_filename)
    self.out_cleaned = os.path.join(data_dir, f"fillna_{fillna}_{out_filename}")
    self.text_columns = text_columns
    self.ref_participant = ref_participant
    self.out_types_word = out_types_word
    self.out_types_sentence = out_types_sentence
    self.preprocessed_data = None
    if kwargs:
        logger.info(f"Unused arguments: {kwargs}")
    if not os.path.exists(self.out_preprocessed):
        logger.info("Preprocessing dataset, this may take some time...")
        self.create_preprocessed_dataset()
        logger.info("Done preprocessing.")
    logger.info(f"Loading preprocessed data from {self.out_preprocessed}")
    self.preprocessed_data = read_tsv(self.out_preprocessed)
    if not os.path.exists(self.out_cleaned):
        # Fill missing values following the specified strategy
        logger.info(f"Filling NaN values using strategy: {fillna}")
        self.fill_nan_values(fillna, nan_cols)
        logger.info("Done filling NaNs")
    logger.info(f"Loading cleaned data from {self.out_cleaned}")
    self.cleaned_data = read_tsv(self.out_cleaned)
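# Illustrative sketch (not used by the pipeline): the fillna strategies listed
# above map onto simple pandas operations. Column and key names below
# ("tot_fix_dur", "word_id") are hypothetical stand-ins for the entries of
# nan_cols and the token identifier used during aggregation.
def _example_fillna_strategies():
    import numpy as np
    import pandas as pd

    df = pd.DataFrame({
        "participant": ["p1", "p2", "p3"],
        "word_id": ["w1", "w1", "w1"],
        "tot_fix_dur": [100.0, np.nan, 200.0],
    })
    # zero: a missing duration counts as 0 during averaging
    zero_filled = df["tot_fix_dur"].fillna(0)
    # mean_participant: fill with the mean value of that token across participants
    mean_filled = df["tot_fix_dur"].fillna(
        df.groupby("word_id")["tot_fix_dur"].transform("mean"))
    return zero_filled, mean_filled  # mean_filled[1] == 150.0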
def read_examples_from_file_token_level(
    filename,
    score_cols,
    word_col="word",
    sentenceid_col="sentence_id",
):
    """Reads the data file and creates a list of dict entries, one per sentence.

    Args:
        filename: Name of the data file in TSV format.
        score_cols: Dict mapping score column names to the task names under
            which they should be retained.
        word_col: Name of the column in the data file containing the word.
        sentenceid_col: Name of the column in the data file for the sentence id.
    """
    df = read_tsv(filename)
    grouped_df = df.groupby(sentenceid_col)
    examples = []
    for _, group in grouped_df:
        example = {}
        words = list(group[word_col])
        for score_col, task_name in score_cols.items():
            example[task_name] = list(group[score_col])
        example["text"] = " ".join(str(word) for word in words)
        examples.append(example)
    return examples
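# Usage sketch (file and column names are hypothetical): for a TSV with
# columns "sentence_id", "word" and "fix_count", the call below returns one
# dict per sentence, e.g. {"fixations": [1, 0, 2], "text": "the cat sat"}.
def _example_read_token_level():
    return read_examples_from_file_token_level(
        "geco_preprocessed.tsv", score_cols={"fix_count": "fixations"})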
def get_et_metrics(sentences, model=None, save_path=None, load_path=None, id="model"):
    if load_path is not None and os.path.exists(load_path):
        logger.info(f"Loading predicted eye-tracking metrics from {load_path}")
        df = read_tsv(load_path)
    else:
        logger.info(f"Running eye-tracking inference with model {model}")
        # Remove all whitespace before punctuation, so that the format matches
        # the one of the eye-tracking files on which the model was trained.
        sentences = ([{
            "text": re.sub(r"\s+([^\w\s])", r"\1", s)
        } for s in sentences] if isinstance(sentences[0], str) else sentences)
        model = MultitaskInferencer.load(model, gpu=True, level="token")
        res = model.inference_from_dicts(dicts=sentences)
        for i, sent in enumerate(res):
            for j, tok in enumerate(sent):
                res[i][j]["sentence_id"] = i
                res[i][j]["token_id"] = j
        res = [token for sentence in res for token in sentence]
        df = pd.DataFrame.from_dict(res)
        df["context"] = [c.rstrip() for c in df["context"]]
        if save_path is not None:
            logger.info(f"Saving predictions to {save_path}")
            save_tsv(df, os.path.join(save_path, f"{id}_preds.tsv"))
    return df
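# Quick check of the normalization used above: whitespace preceding any
# non-word, non-space character (i.e. punctuation) is stripped, so detokenized
# text matches the format of the eye-tracking training files.
def _example_strip_space_before_punct():
    assert re.sub(r"\s+([^\w\s])", r"\1", "Hello , world !") == "Hello, world!"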
def __init__(
    self,
    tokenizer,
    max_seq_len,
    data_dir,
    label_list=None,
    metric=None,
    train_filename="train.tsv",
    dev_filename=None,
    test_filename="test.tsv",
    dev_split=0.1,
    delimiter="\t",
    quote_char=csv.QUOTE_NONE,
    skiprows=None,
    label_column_names=None,
    label_names=None,
    multilabel=False,
    header=0,
    proxies=None,
    max_samples=None,
    text_column_name="text",
    **kwargs,
):
    # Avoid mutable default arguments for the label lists
    label_column_names = label_column_names or []
    label_names = label_names or []
    self.delimiter = delimiter
    self.quote_char = quote_char
    self.skiprows = skiprows
    self.header = header
    self.max_samples = max_samples
    self.text_column_name = text_column_name
    super(TextClassificationProcessor, self).__init__(
        tokenizer=tokenizer,
        max_seq_len=max_seq_len,
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=test_filename,
        dev_split=dev_split,
        data_dir=data_dir,
        tasks={},
        proxies=proxies,
    )
    if metric is None:
        metric = "classification_metrics"
        register_metrics(metric, classification_metrics)
    task_type = "multilabel_classification" if multilabel else "classification"
    data = read_tsv(os.path.join(data_dir, train_filename))
    if label_column_names and label_names:
        for col_name, l_name in zip(label_column_names, label_names):
            self.add_task(
                name=l_name,
                metric=metric,
                # Sort for a deterministic label order across runs
                label_list=sorted(set(data[col_name])),
                label_column_name=col_name,
                task_type=task_type,
                label_name=l_name,
            )
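# Instantiation sketch (tokenizer, paths and column names are hypothetical):
# one task is registered per label column, with the label list inferred from
# the distinct values found in the training file.
def _example_processor_setup(tokenizer):
    return TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=128,
        data_dir="data/complexity",
        train_filename="train.tsv",
        label_column_names=["score_a", "score_b"],
        label_names=["task_a", "task_b"],
    )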
def compute_corr_ranks_over_bins(args, config):
    logger.info("Correlating features with task scores over length bins...")
    # Compute correlation lists for all the length bins
    corr_ranks_per_bin = []
    args.leave_nans = True
    for curr_binsize in range(args.start_bin, args.end_bin + 1, args.bin_step):
        corr_ranks = {}
        for data_name in config.keys():
            data = read_tsv(config[data_name]["path"])
            length_feat = config[data_name]["length_bin_feat"]
            # Keep only examples whose length falls within curr_binsize ± bin_width
            bin_data = data.loc[
                (data[length_feat] >= curr_binsize - args.bin_width)
                & (data[length_feat] <= curr_binsize + args.bin_width), :]
            logger.info(
                f"Bin {curr_binsize}±{args.bin_width} examples: {len(bin_data)}"
            )
            if args.save_binned_data:
                root, _ = os.path.splitext(config[data_name]["path"])
                name = f"{root}_bin{curr_binsize}.tsv"
                logger.info(
                    f"Saving {curr_binsize}±{args.bin_width} bin to {name}")
                save_tsv(bin_data, name)
            corr_ranks = {
                **corr_ranks,
                **compute_corr_ranks(args, bin_data, data_name, config[data_name]),
            }
        for task_name in corr_ranks.keys():
            corr_ranks[task_name].sort(key=lambda tup: tup[1].correlation,
                                       reverse=True)
        corr_ranks_per_bin.append(corr_ranks)
    # Sort the first bin's lists by correlation strength, pushing NaNs to the end
    first_bin_ranks = corr_ranks_per_bin[0]
    for task in first_bin_ranks.keys():
        first_bin_ranks[task].sort(
            key=lambda tup: -1 if np.isnan(tup[1].correlation) else tup[1].correlation,
            reverse=True)
    # Reorder every bin's lists to match the feature order of the first bin
    for task in first_bin_ranks.keys():
        feature_order = [tup[0] for tup in first_bin_ranks[task]]
        for bin_ranks in corr_ranks_per_bin:
            bin_ranks[task].sort(key=lambda x: feature_order.index(x[0]))
    return corr_ranks_per_bin
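# Minimal sketch of the bin membership test used above: with bin_width = 1,
# the bin centered at 10 keeps examples whose length feature lies in [9, 11].
def _example_bin_mask():
    import pandas as pd

    data = pd.DataFrame({"n_tokens": [8, 9, 10, 11, 12]})
    mask = (data["n_tokens"] >= 10 - 1) & (data["n_tokens"] <= 10 + 1)
    return data.loc[mask]  # rows with lengths 9, 10 and 11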
def read_examples_from_file(filename,
                            score_cols,
                            text_col,
                            max_samples=None,
                            start_feat_col=None):
    """Reads a TSV data file and returns a list of record dicts.

    Args:
        filename: Name of the data file in TSV format.
        score_cols: List of score column names to be retained.
        text_col: Name of the column containing the example text.
        max_samples: If set, read a random sample of max_samples examples.
        start_feat_col: Column name of the first feature to be taken into
            account in the dataset. All columns from this one onward are
            stored under the "features" key.
    """
    df = read_tsv(filename)
    if max_samples:
        # Reset the index so positions align with the records list below
        df = df.sample(max_samples).reset_index(drop=True)
    columns = [text_col] + score_cols
    raw_dict = df[columns].to_dict(orient="records")
    if start_feat_col:
        logger.info("Reading features from file...")
        for i, row in df.iterrows():
            raw_dict[i]["features"] = row.loc[start_feat_col:].values
    return raw_dict
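# Note that pandas label-based slicing is inclusive of the start label, so
# row.loc[start_feat_col:] above selects every column from start_feat_col to
# the end of the row. A minimal sketch with hypothetical feature names:
def _example_feature_slice():
    import pandas as pd

    row = pd.Series({"text": "a b", "score": 1.0, "n_tokens": 2, "avg_len": 1.0})
    return row.loc["n_tokens":].values  # array([2, 1.0], dtype=object)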
def compute_sentence_baselines():
    parser = argparse.ArgumentParser()
    parser.add_argument("--all",
                        action="store_true",
                        help="Shorthand to perform all baseline evaluations.")
    parser.add_argument("--stat",
                        action="store_true",
                        help="Perform evaluation with a statistical baseline.")
    parser.add_argument(
        "--svm_len",
        action="store_true",
        help="Perform evaluation with an SVM model based on sentence length.")
    parser.add_argument(
        "--svm_all",
        action="store_true",
        help="Perform evaluation with an SVM model using all linguistic features.")
    parser.add_argument("--path",
                        type=str,
                        required=True,
                        help="Path to the file containing the dataset.")
    parser.add_argument(
        "--text_column",
        type=str,
        required=True,
        help="Name of the column in the dataset containing sentences.")
    parser.add_argument(
        "--score_column",
        type=str,
        required=True,
        help="Name of the column in the dataset containing the score.")
    parser.add_argument(
        "--n_splits",
        default=5,
        type=int,
        help="Number of train-test splits used to compute the baseline.",
    )
    parser.add_argument("--bin_size",
                        default=5.0,
                        type=float,
                        help="Bin size used for the statistical baseline.")
    parser.add_argument("--log_dir",
                        default="logs",
                        type=str,
                        help="The log dir. Logs of runs will be saved there.")
    parser.add_argument(
        "--log",
        action="store_true",
        help="Set this flag if you want to log the current run to a file.")
    parser.add_argument("--write_tsv",
                        action="store_true",
                        help="Write baseline scores to a TSV file.")
    parser.add_argument("--write_tsv_header",
                        action="store_true",
                        help="Also write a header row to the scores TSV file.")
    parser.add_argument("--tsv_path",
                        default="logs/sentence_baselines.tsv",
                        help="Path of the TSV file where scores are written.")
    args = parser.parse_args()
    handlers = [logging.StreamHandler()]
    if args.all:
        args.stat, args.svm_len, args.svm_all = True, True, True
    # Setup logging to file
    if args.log:
        name = os.path.splitext(os.path.basename(args.path))[0]
        filehandler = logging.FileHandler(
            os.path.join(args.log_dir, f"baselines_{name}.log"))
        filehandler.setLevel(logging.INFO)
        handlers.append(filehandler)
    logging.basicConfig(
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
        datefmt="%d-%m-%y %H:%M:%S",
        level=logging.INFO,
        handlers=handlers,
    )
    args.logger = logging.getLogger(__name__)
    args.logger.info(vars(args))
    # Load data
    data = read_tsv(args.path)
    args.data_size = len(data)
    if "n_tokens" not in data.columns:
        raise AttributeError(
            "Run preprocess.py with the --do_features option to enable "
            "baseline computations.")
    if args.svm_len:
        avg_scores = compute_scores_crossval(args, data, length_svm)
        log_scores(args, length_svm, avg_scores)
        if args.write_tsv:
            write_scores(args,
                         length_svm,
                         avg_scores,
                         write_head=args.write_tsv_header)
    if args.svm_all:
        args.feat_start_idx = data.columns.get_loc("n_tokens")
        avg_scores = compute_scores_crossval(args, data, ling_feat_svm)
        log_scores(args, ling_feat_svm, avg_scores)
        if args.write_tsv:
            write_scores(args, ling_feat_svm, avg_scores)
    if args.stat:
        # Round sentence lengths to the nearest bin
        data["n_tokens"] = [
            int(round(x / args.bin_size) * args.bin_size)
            for x in data["n_tokens"]
        ]
        avg_scores = compute_scores_crossval(args, data, baseline)
        log_scores(args, baseline, avg_scores)
        if args.write_tsv:
            write_scores(args, baseline, avg_scores)
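# Worked example of the binning used by the statistical baseline above: with
# bin_size = 5.0, a 13-token sentence falls into bin 15, a 12-token one into bin 10.
def _example_round_to_bin():
    bin_size = 5.0
    assert int(round(13 / bin_size) * bin_size) == 15
    assert int(round(12 / bin_size) * bin_size) == 10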
def get_surprisals(args):
    set_seed(args.seed, cuda=args.cuda)
    logger.info("Importing tokenizer and pre-trained model")
    tok_class = None if not args.model_class_name else f"{args.model_class_name}Tokenizer"
    ref = args.reference_hf_model if args.reference_hf_model is not None else args.model_name_or_path
    model = AutoModelWithLMHead.from_pretrained(ref)
    # When loading a local model, we replace the AutoModel base weights with
    # the local ones, while the LM head used for surprisal remains the
    # original pretrained head.
    if args.reference_hf_model is not None:
        farm_lm = LanguageModel.load(
            args.model_name_or_path,
            language_model_class=args.model_class_name)
        logger.info(
            f"Setting model.{model.base_model_prefix} attribute with model: {args.model_name_or_path}"
        )
        setattr(model, model.base_model_prefix, farm_lm.model)
        tokenizer = CustomTokenizer.load(
            pretrained_model_name_or_path=args.model_name_or_path,
            do_lower_case=args.do_lower_case,
            tokenizer_class=tok_class,
        )
    else:
        tokenizer = AutoTokenizer.from_pretrained(ref)
    device = torch.device("cuda" if args.cuda else "cpu")
    model.to(device)
    model.eval()
    logger.info(f"Reading sentences from {args.inputf}")
    if args.inputf.endswith(".tsv"):  # lingcomp tsv format
        df = read_tsv(args.inputf)
        sentences = list(df["text"])
    elif args.inputf.endswith(".json"):  # syntaxgym test suite format
        sentences = get_sentences_from_json(args.inputf)
    elif args.inputf.endswith(".txt"):  # one sentence per line
        with open(args.inputf, "r") as f:
            sentences = [line for line in f.read().split("\n") if line]
    else:
        raise AttributeError(
            "Only .tsv, .json and .txt input files are supported.")
    dict_list = []
    for i, sentence in tqdm(enumerate(sentences), total=len(sentences)):
        surprisals = get_surprisal_scores(sentence, tokenizer, model, device)
        if args.mode in ["token", "sentence"]:
            for token, token_idx, surprisal, _, _ in surprisals:
                dict_list.append({
                    "sentence_id": i + 1,
                    "token_id": token_idx,
                    "token": token,
                    "surprisal": surprisal
                })
        elif args.mode == "word":
            words, word_surps, word_spans = aggregate_word_level(
                sentence, surprisals)
            for j, word in enumerate(words):
                dict_list.append({
                    "start": word_spans[j]["start"],
                    "end": word_spans[j]["end"],
                    "context": word,
                    "surprisal": word_surps[j],
                    "sentence_id": i + 1,
                    "token_id": j + 1,
                })
    out = pd.DataFrame(dict_list)
    if args.mode == "sentence":
        # Sentence surprisal is the sum of its token surprisals
        surprisals = list(
            out.groupby("sentence_id", sort=False).sum()["surprisal"])
        assert len(surprisals) == len(
            sentences), "Sentence-surprisal number mismatch"
        dict_list = []
        for k, sent in enumerate(sentences):
            dict_list.append({
                "sentence_id": k + 1,
                "sentence": sent,
                "surprisal": surprisals[k]
            })
        out = pd.DataFrame(dict_list)
    save_tsv(out, args.outputf)
    logger.info(
        f"Surprisal values at {args.mode}-level were saved to {args.outputf}")
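# The sentence-level branch above sums token surprisals per sentence: since
# surprisal is a negative log-probability, token values add up to the
# surprisal of the whole sentence. Minimal sketch of the groupby logic:
def _example_sentence_surprisal():
    import pandas as pd

    out = pd.DataFrame({
        "sentence_id": [1, 1, 2],
        "surprisal": [1.5, 2.0, 3.0],
    })
    sums = out.groupby("sentence_id", sort=False).sum()["surprisal"]
    return list(sums)  # [3.5, 3.0]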
def create_preprocessed_dataset(self): data = pd.read_excel( self.data_path, usecols=GECO_DATA_COLS, sheet_name="DATA", na_values=GECO_NA_VALUES, keep_default_na=False, ) extra = pd.read_excel(self.materials_path, sheet_name="ALL", na_values=["N/A"], keep_default_na=False, usecols=GECO_MATERIAL_COLS) sent_ids = read_tsv(self.sentence_ids_path) logger.info("Preprocessing values for the dataset...") df = pd.merge(data, extra, how="left", on="WORD_ID") df = pd.merge(df, sent_ids, how="left", on="WORD_ID") # Clean up words since we need to rely on whitespaces for aligning # sentences with tokens. df["WORD"] = [str(w).replace(" ", "") for w in df["WORD"]] # Create new fields for the dataset text_id = [f"{x}-{y}" for x, y in zip(df["PART"], df["TRIAL"])] length = [len(str(x)) for x in df["WORD"]] # Handle the case where we don't fill NaN values mean_fix_dur = [] for x, y in zip(df["WORD_TOTAL_READING_TIME"], df["WORD_FIXATION_COUNT"]): if pd.isna(x): mean_fix_dur.append(np.nan) elif y == 0: mean_fix_dur.append(0) else: mean_fix_dur.append(x / y) refix_count = [max(x - 1, 0) for x in df["WORD_RUN_COUNT"]] reread_prob = [x > 1 for x in df["WORD_FIXATION_COUNT"]] # Handle the case where we don't fill NaN values tot_regr_from_dur = [] for x, y in zip(df["WORD_GO_PAST_TIME"], df["WORD_SELECTIVE_GO_PAST_TIME"]): if pd.isna(x) or pd.isna(y): tot_regr_from_dur.append(np.nan) else: tot_regr_from_dur.append(max(x - y, 0)) # 2050 tokens per participant do not have POS info. # We use a special UNK token for missing pos tags. pos = [ GECO_POS_MAP[x] if not pd.isnull(x) else GECO_POS_MAP["UNK"] for x in df["PART_OF_SPEECH"] ] fix_prob = [1 - x for x in df["WORD_SKIP"]] # Format taken from Hollenstein et al. 2019 "NER at First Sight" out = pd.DataFrame({ # Identifiers "participant": df["PP_NR"], "text_id": text_id, # PART-TRIAL for GECO "sentence_id": df["SENTENCE_ID"], # Absolute sentence position for GECO # AOI-level measures "word_id": df["WORD_ID"], "word": df["WORD"], "length": length, "pos": pos, # Basic measures "fix_count": df["WORD_FIXATION_COUNT"], "fix_prob": fix_prob, "mean_fix_dur": mean_fix_dur, # Early measures "first_fix_dur": df["WORD_FIRST_FIXATION_DURATION"], "first_pass_dur": df["WORD_GAZE_DURATION"], # Late measures "tot_fix_dur": df["WORD_TOTAL_READING_TIME"], "refix_count": refix_count, "reread_prob": reread_prob, # Context measures "tot_regr_from_dur": tot_regr_from_dur, "n-2_fix_prob": ([0, 0] + fix_prob)[:len(df)], "n-1_fix_prob": ([0] + fix_prob)[:len(df)], "n+1_fix_prob": (fix_prob + [0])[1:], "n+2_fix_prob": (fix_prob + [0, 0])[2:], "n-2_fix_dur": ([0, 0] + list(df["WORD_TOTAL_READING_TIME"]))[:len(df)], "n-1_fix_dur": ([0] + list(df["WORD_TOTAL_READING_TIME"]))[:len(df)], "n+1_fix_dur": (list(df["WORD_TOTAL_READING_TIME"]) + [0])[1:], "n+2_fix_dur": (list(df["WORD_TOTAL_READING_TIME"]) + [0, 0])[2:], }) # Convert to correct data types out = out.astype(self.out_types_word) # Caching preprocessed dataset for next Processor calls save_tsv(out, self.out_preprocessed) logger.info(f"GECO data were preprocessed and saved as" f" {self.out_preprocessed} with shape {out.shape}") self.preprocessed_data = out
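# The n±k context measures above are built by padding and slicing the
# column-aligned value lists. Note that this simple shift does not reset at
# sentence or participant boundaries. Minimal sketch:
def _example_context_shift():
    fix_prob = [0.4, 0.9, 0.7]
    n_minus_1 = ([0] + fix_prob)[:len(fix_prob)]  # [0, 0.4, 0.9]
    n_plus_1 = (fix_prob + [0])[1:]  # [0.9, 0.7, 0]
    return n_minus_1, n_plus_1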
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--all",
                        action="store_true",
                        help="Shorthand to perform all analysis steps.")
    parser.add_argument(
        "--config_path",
        type=str,
        default=None,
        help="Path to the config json file used for linguistic analysis. "
        "By default uses the DEFAULT_CONFIG specified in this file.",
    )
    parser.add_argument("--out_dir",
                        type=str,
                        default="logs/feature_analysis",
                        help="Directory in which results will be saved.")
    parser.add_argument(
        "--do_feat_corr_ranks",
        action="store_true",
        help="Compute correlation ranks between features and task scores.")
    parser.add_argument(
        "--do_feat_svr_ranks",
        action="store_true",
        help="Compute SVR coefficient ranks between features and task scores.",
    )
    parser.add_argument(
        "--do_compare_corr_ranks",
        action="store_true",
        help="Compute correlation rank differences across tasks.")
    parser.add_argument(
        "--do_rankings_correlation",
        action="store_true",
        help="Correlate the rank differences computed by --do_compare_corr_ranks.")
    parser.add_argument(
        "--do_feat_corr_ranks_over_bins",
        action="store_true",
        help="Compute feature correlation ranks over sentence length bins.")
    parser.add_argument(
        "--start_bin",
        type=int,
        default=10,
        help="The starting size bin for which feature correlation should be computed.",
    )
    parser.add_argument(
        "--end_bin",
        type=int,
        default=35,
        help="The ending size bin for which feature correlation should be computed.")
    parser.add_argument(
        "--bin_step",
        type=int,
        default=5,
        help="The step size to be taken from start bin to end bin.")
    parser.add_argument(
        "--bin_width",
        type=int,
        default=1,
        help="The +- interval in which scores are considered to be part of the same bin.",
    )
    parser.add_argument(
        "--overwrite_output_files",
        action="store_true",
        help="Specifies that existing output files should be overwritten by new ones. "
        "By default, results are appended to existing files.",
    )
    parser.add_argument(
        "--save_binned_data",
        action="store_true",
        help="If specified, saves the binned data in tsv format.")
    args = parser.parse_args()
    args.leave_nans = False
    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)
    if args.config_path is None:
        config = DEFAULT_CONFIG
    else:
        with open(args.config_path, "r") as c:
            config = json.load(c)
    args.write_mode = "w" if args.overwrite_output_files else "a+"
    corr_ranks = {}
    svr_ranks = {}
    if args.all:
        args.do_feat_svr_ranks = True
        args.do_feat_corr_ranks, args.do_compare_corr_ranks = True, True
        args.do_rankings_correlation, args.do_feat_corr_ranks_over_bins = True, True
    for data_name in config.keys():
        data = read_tsv(config[data_name]["path"])
        corr_ranks = {
            **corr_ranks,
            **compute_corr_ranks(args, data, data_name, config[data_name]),
        }
        if args.do_feat_svr_ranks:
            svr_ranks = {
                **svr_ranks,
                **compute_svr_ranks(args, data, data_name, config[data_name]),
            }
    for task_name in corr_ranks.keys():
        corr_ranks[task_name].sort(key=lambda tup: tup[1].correlation,
                                   reverse=True)
        if args.do_feat_svr_ranks:
            svr_ranks[task_name].sort(key=lambda tup: tup[1], reverse=True)
    if args.do_feat_corr_ranks:
        print_ranks(args, corr_ranks)
    if args.do_feat_svr_ranks:
        print_ranks(args, svr_ranks, rtype="svr")
    if args.do_compare_corr_ranks:
        if len(corr_ranks.keys()) < 2:
            raise AttributeError("At least two tasks should be specified to "
                                 "compare correlation ranks.")
        diff_corr = compare_corr_ranks(args, corr_ranks, TARGET_TASK)
        for task_name in diff_corr.keys():
            diff_corr[task_name].sort(key=lambda tup: abs(tup[1]),
                                      reverse=True)
        print_diff_corr_ranks(args, diff_corr, TARGET_TASK)
    if args.do_rankings_correlation:
        if len(corr_ranks.keys()) < 2:
            raise AttributeError("At least two tasks should be specified to "
                                 "compare correlation ranks.")
        if not args.do_compare_corr_ranks:
            raise AttributeError(
                "Correlation rank differences should be computed before "
                "correlating the rankings.")
        rankings_correlation(args, diff_corr, TARGET_TASK)
    if args.do_feat_corr_ranks_over_bins:
        if args.start_bin is None or args.end_bin is None:
            raise AttributeError(
                "The start_bin and end_bin arguments should be specified "
                "for the feature_corr_ranks_over_bins option.")
        ranks_per_bin = compute_corr_ranks_over_bins(args, config)
        print_corr_ranks_over_bins(args, ranks_per_bin)