def __init__(
     self,
     data_dir,
     data_filename,
     out_filename,
     ref_participant,
     text_columns=TEXT_COLUMNS,
     out_types_word=OUT_TYPES_WORD,
     out_types_sentence=OUT_TYPES_SENTENCE,
     nan_cols=FILLNA_COLUMNS,
     fillna="zero",
     **kwargs,
 ):
     """
     data_dir: Directory where eye-tracking dataset and materials are contained.
     data_filename: Name of main file in the data_dir containing eye-tracking measurements.
     out_filename: File where the preprocessed output will be saved.
     text_columns: Names of columns to be treated as text during aggregation
     ref_participant: The name of the reference participant having annotated all examples,
         used for grouping and averaging scores.
     out_types_word: Dictionary of data types of word-level preprocessed data,
         with entries structured as column name : data type.
     out_types_sentence: Dictionary of data types of sentence-level preprocessed data,
         with entries structured as column name : data type.
     nan_cols: List of column names for columns that can possibly include NaN values.
     fillna: Specifies the fill-NaN strategy enacted during aggregation in get_word_data and get_sentence_data. Default: zero.
         Choose one among:
             - none: leaves NaNs as-is.
             - zero: fills NaNs with 0 => missing duration will count as 0 during averaging.
             - (min|mean|max)_participant: fills NaNs with the min|mean|max value for that token across participants.
         To be added in the future:
             - (min|mean|max)_type: fills NaNs with the min|mean|max value for that token in the whole dataset.
     """
     self.data_dir = data_dir
     self.data_path = os.path.join(data_dir, data_filename)
     self.out_preprocessed = os.path.join(data_dir, out_filename)
     self.out_cleaned = os.path.join(data_dir,
                                     f"fillna_{fillna}_{out_filename}")
     self.text_columns = text_columns
     self.ref_participant = ref_participant
     self.out_types_word = out_types_word
     self.out_types_sentence = out_types_sentence
     self.preprocessed_data = None
     logger.info(f"Unused arguments: {kwargs}")
     if not os.path.exists(self.out_preprocessed):
         logger.info("Preprocessing dataset, this may take some time...")
         self.create_preprocessed_dataset()
         logger.info("Done preprocessing.")
     logger.info(f"Loading preprocessed data from {self.out_preprocessed}")
     self.preprocessed_data = read_tsv(self.out_preprocessed)
     if not os.path.exists(self.out_cleaned):
         # We fill missing value following the specified strategy
         logger.info(f"Filling NaN values using strategy: {fillna}")
         self.fill_nan_values(fillna, nan_cols)
         logger.info("Done filling NaNs")
     logger.info(f"Loading cleaned data from {self.out_cleaned}")
     self.cleaned_data = read_tsv(self.out_cleaned)
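A minimal usage sketch of this constructor; the enclosing class name, directory, and file names below are assumptions made for illustration, not taken from the example itself.

# Hypothetical instantiation; class name and paths are placeholders.
processor = EyeTrackingProcessor(
    data_dir="data/geco",                         # assumed data directory
    data_filename="MonolingualReadingData.xlsx",  # assumed raw measurements file
    out_filename="preprocessed_word.tsv",         # cached preprocessed output
    ref_participant="pp01",                       # participant used as reference
    fillna="mean_participant",                    # fill NaNs with per-token participant mean
)
print(processor.cleaned_data.shape)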
Example #2
def read_examples_from_file_token_level(
    filename,
    score_cols,
    word_col="word",
    sentenceid_col="sentence_id",
):
    """Reads data file and creates the list of dict entries.
    Args:
        filename: Name of the data file in TSV format.
        score_cols: Dict of score columns and score names to be retained.
        word_col: Name of the column in the data file where the word is contained.
        sentenceid_col: Name of the column in the data file for the sentence id.
        sep: Separator of the data file
    """
    df = read_tsv(filename)
    grouped_df = df.groupby(sentenceid_col)
    examples = []
    for key, _ in grouped_df:
        example = {}
        group = grouped_df.get_group(key)
        words = list(group[word_col])
        for score_col, task_name in score_cols.items():
            example[task_name] = list(group[score_col])
        example["text"] = " ".join([str(word) for word in words])
        examples.append(example)
    return examples
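A small usage sketch (the file path and column mapping are assumptions): each score column is stored under the task name it maps to, alongside the reconstructed sentence text.

# Hypothetical call on a word-level TSV containing "word", "sentence_id",
# and a token-level score column such as "fix_count".
examples = read_examples_from_file_token_level(
    "data/preprocessed_word.tsv",                # assumed path
    score_cols={"fix_count": "fixation_count"},  # column name -> task name
)
print(examples[0]["text"])
print(examples[0]["fixation_count"][:5])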
Example #3
def get_et_metrics(sentences,
                   model=None,
                   save_path=None,
                   load_path=None,
                   id="model"):
    if load_path is not None and os.path.exists(load_path):
        logger.info(f"Loading predicted eye-tracking metrics from {load_path}")
        df = read_tsv(load_path)
    else:
        logger.info(f"Inferencing eye-tracking predictions with model {model}")
        # Remove all whitespaces before punctuation, to make sure that format actually
        # matches the one used in eye-tracking files on which the model was trained.
        sentences = ([{
            "text": re.sub(r"\s+([^\w\s])", r"\1", s)
        } for s in sentences] if type(sentences[0]) is str else sentences)
        model = MultitaskInferencer.load(model, gpu=True, level="token")
        res = model.inference_from_dicts(dicts=sentences)
        for i, sent in enumerate(res):
            for j, tok in enumerate(sent):
                res[i][j]["sentence_id"] = i
                res[i][j]["token_id"] = j
        res = [token for sentence in res for token in sentence]
        df = pd.DataFrame.from_dict(res)
        df["context"] = [c.rstrip() for c in df["context"]]
        if save_path is not None:
            logger.info(f"Saving inferenced predictions to {save_path}")
            save_tsv(df, f"{save_path}/{id}_preds.tsv")
    return df
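A usage sketch, assuming a trained multitask eye-tracking model saved on disk; the model directory and output path are placeholders.

# Hypothetical call: predict token-level eye-tracking metrics for raw sentences
# and cache them so that later runs can pass load_path instead of a model.
sentences = ["The quick brown fox jumps over the lazy dog ."]
preds = get_et_metrics(
    sentences,
    model="models/et_multitask",  # assumed model directory
    save_path="logs/et_preds",    # assumed existing output directory
    id="gaze_model",
)
print(preds.head())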
Example #4
    def __init__(
        self,
        tokenizer,
        max_seq_len,
        data_dir,
        label_list=None,
        metric=None,
        train_filename="train.tsv",
        dev_filename=None,
        test_filename="test.tsv",
        dev_split=0.1,
        delimiter="\t",
        quote_char=csv.QUOTE_NONE,
        skiprows=None,
        label_column_names=[],
        label_names=[],
        multilabel=False,
        header=0,
        proxies=None,
        max_samples=None,
        text_column_name="text",
        **kwargs,
    ):
        self.delimiter = delimiter
        self.quote_char = quote_char
        self.skiprows = skiprows
        self.header = header
        self.max_samples = max_samples
        self.text_column_name = text_column_name

        super(TextClassificationProcessor, self).__init__(
            tokenizer=tokenizer,
            max_seq_len=max_seq_len,
            train_filename=train_filename,
            dev_filename=dev_filename,
            test_filename=test_filename,
            dev_split=dev_split,
            data_dir=data_dir,
            tasks={},
            proxies=proxies,
        )
        if metric is None:
            metric = "classification_metrics"
            register_metrics(metric, classification_metrics)
        task_type = "multilabel_classification" if multilabel else "classification"
        data = read_tsv(os.path.join(data_dir, train_filename))
        if label_column_names and label_names:
            for col_name, l_name in zip(label_column_names, label_names):
                self.add_task(
                    name=l_name,
                    metric=metric,
                    label_list=list(set(data[col_name])),
                    label_column_name=col_name,
                    task_type=task_type,
                    label_name=l_name,
                )
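A sketch of how this processor might be instantiated; the tokenizer loading call, data directory, and column names are assumptions made for illustration.

# Hypothetical setup: one task registered for the assumed "score" label column.
tokenizer = CustomTokenizer.load(
    pretrained_model_name_or_path="bert-base-cased",  # assumed model
    do_lower_case=False,
    tokenizer_class=None,
)
processor = TextClassificationProcessor(
    tokenizer=tokenizer,
    max_seq_len=128,
    data_dir="data/complexity",    # assumed directory containing train.tsv / test.tsv
    label_column_names=["score"],  # assumed label column in the TSV
    label_names=["complexity"],    # task name registered for that column
)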
Example #5
def compute_corr_ranks_over_bins(args, config):
    logger.info(
        "Correlate features with task scores over various length bins...")
    # Compute correlation lists for all the length bins
    corr_ranks_per_bin = []
    args.leave_nans = True
    for curr_binsize in range(args.start_bin, args.end_bin + 1, args.bin_step):
        corr_ranks = {}
        for data_name in config.keys():
            data = read_tsv(config[data_name]["path"])
            bin_data = data.loc[
                (data[config[data_name]["length_bin_feat"]] >= curr_binsize -
                 args.bin_width)
                & (data[config[data_name]["length_bin_feat"]] <= curr_binsize +
                   args.bin_width), :, ]
            logger.info(
                f"Bin {curr_binsize}±{args.bin_width} examples: {len(bin_data)}"
            )
            if args.save_binned_data:
                name = config[data_name]["path"].split(
                    ".")[0] + f"_bin{curr_binsize}.tsv"
                logger.info(
                    f"Saving {curr_binsize}±{args.bin_width} bin to {name}")
                save_tsv(bin_data, name)
            corr_ranks = {
                **corr_ranks,
                **(compute_corr_ranks(args, bin_data, data_name, config[data_name]))
            }
        for task_name in corr_ranks.keys():
            corr_ranks[task_name].sort(key=lambda tup: tup[1].correlation,
                                       reverse=True)
        corr_ranks_per_bin.append(corr_ranks)
    # Order first correlation lists by correlation intensity of features
    first_bin_ranks = corr_ranks_per_bin[0]
    for task in first_bin_ranks.keys():
        first_bin_ranks[task].sort(
            key=lambda tup: -1
            if np.isnan(tup[1].correlation) else tup[1].correlation,
            reverse=True)
    # Order all correlation lists based on the one for the first bin
    for i in range(len(corr_ranks_per_bin)):
        for task in corr_ranks_per_bin[i].keys():
            corr_ranks_per_bin[i][task].sort(key=lambda x: [
                first_bin_ranks[task].index(tup)
                for tup in first_bin_ranks[task] if tup[0] == x[0]
            ])
    return corr_ranks_per_bin
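The function expects an argparse-style namespace and a config dict; a sketch of plausible inputs follows (paths, bin values, and any extra fields consumed by compute_corr_ranks are assumptions).

import argparse

# Hypothetical args/config mirroring the attributes accessed above.
args = argparse.Namespace(
    start_bin=10, end_bin=35, bin_step=5, bin_width=1,
    save_binned_data=False, leave_nans=True,
)
config = {
    "geco": {
        "path": "data/preprocessed_sentence.tsv",  # assumed TSV with features and scores
        "length_bin_feat": "n_tokens",             # feature used for length binning
        # ...plus whatever fields compute_corr_ranks expects for this dataset.
    }
}
ranks_per_bin = compute_corr_ranks_over_bins(args, config)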
Example #6
def read_examples_from_file(filename,
                            score_cols,
                            text_col,
                            max_samples=None,
                            start_feat_col=None):
    """ start_feat_col : Column name of the first feature to be taken into account in the dataset """
    df = read_tsv(filename)
    if max_samples:
        df = df.sample(max_samples)
    columns = [text_col] + score_cols
    df_filter = df[columns]
    raw_dict = df_filter.to_dict(orient="records")
    if start_feat_col:
        logger.info("Reading features from files...")
        for i, row in df.iterrows():
            raw_dict[i]["features"] = row.loc[start_feat_col:].values
    return raw_dict
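A usage sketch (the path and column names are assumptions): sentence-level scores are read as records, optionally together with all feature columns from start_feat_col onward.

# Hypothetical call on a sentence-level TSV.
records = read_examples_from_file(
    "data/preprocessed_sentence.tsv",  # assumed path
    score_cols=["tot_fix_dur"],        # assumed sentence-level score column
    text_col="text",
    start_feat_col="n_tokens",         # assumed first linguistic-feature column
)
print(records[0]["text"], records[0]["tot_fix_dur"])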
Example #7
def compute_sentence_baselines():
    parser = argparse.ArgumentParser()
    parser.add_argument("--all",
                        action="store_true",
                        help="Shorthand to perform all baseline evaluations")
    parser.add_argument("--stat",
                        action="store_true",
                        help="Perform evaluation with a statistic baseline.")
    parser.add_argument(
        "--svm_len",
        action="store_true",
        help="Perform evaluation with an SVM model based on sentence length.")
    parser.add_argument(
        "--svm_all",
        action="store_true",
        help=
        "Perform evaluation with an SVM model using all linguistic features.")
    parser.add_argument("--path",
                        type=str,
                        required=True,
                        help="Path to the file containing the dataset.")
    parser.add_argument(
        "--text_column",
        type=str,
        required=True,
        help="Name of the column in dataset containing sentences.")
    parser.add_argument(
        "--score_column",
        type=str,
        required=True,
        help="Name of the column in dataset containing the score.")
    parser.add_argument(
        "--n_splits",
        default=5,
        type=int,
        help=
        "Number of train-test splits that should be used to compute the baseline.",
    )
    parser.add_argument("--bin_size",
                        default=5.0,
                        type=float,
                        help="Bin size to compute baseline.")
    parser.add_argument("--log_dir",
                        default="logs",
                        type=str,
                        help="The log dir. Logs of runs will be saved there.")
    parser.add_argument(
        "--log",
        action="store_true",
        help="Set this flag if you want to log the current run to a file.")
    parser.add_argument("--write_tsv", action="store_true")
    parser.add_argument("--write_tsv_header", action="store_true")
    parser.add_argument("--tsv_path", default="logs/sentence_baselines.tsv")
    args = parser.parse_args()
    handlers = [logging.StreamHandler()]
    if args.all:
        args.stat, args.svm_len, args.svm_all = True, True, True
    # Setup logging to file
    if args.log:
        name = args.path.split("/")[-1].split(".")[0]
        filehandler = logging.FileHandler(
            os.path.join(args.log_dir, f"baselines_{name}.log"))
        filehandler.setLevel(logging.INFO)
        handlers.append(filehandler)

    logging.basicConfig(
        format="%(asctime)s %(levelname)s %(name)s  %(message)s",
        datefmt="%d-%m-%y %H:%M:%S",
        level=logging.INFO,
        handlers=handlers,
    )
    args.logger = logging.getLogger(__name__)
    args.logger.info(vars(args))
    # Load data
    data = read_tsv(args.path)
    args.data_size = len(data)
    if "n_tokens" not in data.columns:
        raise AttributeError(
            "Run preprocess.py with --do_features option to enable baseline computations."
        )
    if args.svm_len:
        avg_scores = compute_scores_crossval(args, data, length_svm)
        log_scores(args, length_svm, avg_scores)
        if args.write_tsv:
            write_scores(args,
                         length_svm,
                         avg_scores,
                         write_head=args.write_tsv_header)
    if args.svm_all:
        args.feat_start_idx = data.columns.get_loc("n_tokens")
        avg_scores = compute_scores_crossval(args, data, ling_feat_svm)
        log_scores(args, ling_feat_svm, avg_scores)
        if args.write_tsv:
            write_scores(args, ling_feat_svm, avg_scores)
    if args.stat:
        # Round values to nearest bin
        data["n_tokens"] = [
            int(round(x / args.bin_size) * args.bin_size)
            for x in data["n_tokens"]
        ]
        avg_scores = compute_scores_crossval(args, data, baseline)
        log_scores(args, baseline, avg_scores)
        if args.write_tsv:
            write_scores(args, baseline, avg_scores)
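Since this function parses its own command-line arguments, a run can be simulated in-process as sketched below; the script name, dataset path, and column names are assumptions.

import sys

# Hypothetical invocation of the sentence-level baselines.
sys.argv = [
    "sentence_baselines.py",
    "--all",
    "--path", "data/preprocessed_sentence.tsv",  # assumed dataset path
    "--text_column", "text",
    "--score_column", "tot_fix_dur",             # assumed sentence-level score
]
compute_sentence_baselines()

Example #8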
def get_surprisals(args):
    set_seed(args.seed, cuda=args.cuda)
    logger.info("Importing tokenizer and pre-trained model")
    tok_class = None if not args.model_class_name else f"{args.model_class_name}Tokenizer"
    ref = args.reference_hf_model if args.reference_hf_model is not None else args.model_name_or_path
    model = AutoModelWithLMHead.from_pretrained(ref)
    # Loading a local model, we need to replace the AutoModel with the local model
    if args.reference_hf_model is not None:
        farm_lm = LanguageModel.load(
            args.model_name_or_path,
            language_model_class=args.model_class_name)
        # Set the underlying model to the custom loaded model
        # The LM head used for surprisal is the original pretrained head
        logger.info(
            f"Setting model.{model.base_model_prefix} attribute with model: {args.model_name_or_path}"
        )
        setattr(model, model.base_model_prefix, farm_lm.model)
        tokenizer = CustomTokenizer.load(
            pretrained_model_name_or_path=args.model_name_or_path,
            do_lower_case=args.do_lower_case,
            tokenizer_class=tok_class,
        )
    else:
        tokenizer = AutoTokenizer.from_pretrained(ref)
    device = torch.device("cuda" if args.cuda else "cpu")
    model.to(device)
    model.eval()
    logger.info(f"Reading sentences from {args.inputf}")
    if args.inputf.endswith(".tsv"):  # lingcomp tsv format
        df = read_tsv(args.inputf)
        sentences = list(df["text"])
    elif args.inputf.endswith(".json"):  # syntaxgym test suite format
        sentences = get_sentences_from_json(args.inputf)
    elif args.inputf.endswith(".txt"):  # one sentencen per line
        sentences = open(args.inputf, "r").read().split("\n")
    else:
        raise AttributeError(
            "Only .tsv, .json and .txt input files are supported.")
    dict_list = []
    for i, sentence in enumerate(tqdm(sentences)):
        surprisals = get_surprisal_scores(sentence, tokenizer, model, device)
        if args.mode in ["token", "sentence"]:
            for token, token_idx, surprisal, _, _ in surprisals:
                dict_list.append({
                    "sentence_id": i + 1,
                    "token_id": token_idx,
                    "token": token,
                    "surprisal": surprisal
                })
        elif args.mode == "word":
            words, word_surps, word_spans = aggregate_word_level(
                sentence, surprisals)
            for j, word in enumerate(words):
                dict_list.append({
                    "start": word_spans[j]["start"],
                    "end": word_spans[j]["end"],
                    "context": word,
                    "surprisal": word_surps[j],
                    "sentence_id": i + 1,
                    "token_id": j + 1,
                })
    out = pd.DataFrame(dict_list)
    if args.mode == "sentence":
        surprisals = list(
            out.groupby("sentence_id", sort=False).sum()["surprisal"])
        assert len(surprisals) == len(
            sentences), "Sentence-surprisal number mismatch"
        dict_list = []
        for k, sent in enumerate(sentences):
            dict_list.append({
                "sentence_id": k + 1,
                "sentence": sent,
                "surprisal": surprisals[k]
            })
        out = pd.DataFrame(dict_list)
    logger.info(
        f"Saving surprisal values at {args.mode}-level to {args.outputf}")
    save_tsv(out, args.outputf)
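A sketch of the namespace this function expects, here configured to score a plain-text file with an off-the-shelf GPT-2 LM head; the model id, paths, and mode are assumptions.

import argparse

# Hypothetical args for computing word-level surprisal with a pretrained model.
args = argparse.Namespace(
    seed=42,
    cuda=False,
    model_class_name=None,
    reference_hf_model=None,
    model_name_or_path="gpt2",          # assumed Hugging Face model id
    do_lower_case=False,
    inputf="data/sentences.txt",        # assumed input, one sentence per line
    outputf="logs/gpt2_surprisals.tsv",
    mode="word",                        # one of: token, word, sentence
)
get_surprisals(args)

Example #9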
 def create_preprocessed_dataset(self):
     data = pd.read_excel(
         self.data_path,
         usecols=GECO_DATA_COLS,
         sheet_name="DATA",
         na_values=GECO_NA_VALUES,
         keep_default_na=False,
     )
     extra = pd.read_excel(self.materials_path,
                           sheet_name="ALL",
                           na_values=["N/A"],
                           keep_default_na=False,
                           usecols=GECO_MATERIAL_COLS)
     sent_ids = read_tsv(self.sentence_ids_path)
     logger.info("Preprocessing values for the dataset...")
     df = pd.merge(data, extra, how="left", on="WORD_ID")
     df = pd.merge(df, sent_ids, how="left", on="WORD_ID")
     # Clean up words since we need to rely on whitespaces for aligning
     # sentences with tokens.
     df["WORD"] = [str(w).replace(" ", "") for w in df["WORD"]]
     # Create new fields for the dataset
     text_id = [f"{x}-{y}" for x, y in zip(df["PART"], df["TRIAL"])]
     length = [len(str(x)) for x in df["WORD"]]
     # Handle the case where we don't fill NaN values
     mean_fix_dur = []
     for x, y in zip(df["WORD_TOTAL_READING_TIME"],
                     df["WORD_FIXATION_COUNT"]):
         if pd.isna(x):
             mean_fix_dur.append(np.nan)
         elif y == 0:
             mean_fix_dur.append(0)
         else:
             mean_fix_dur.append(x / y)
     refix_count = [max(x - 1, 0) for x in df["WORD_RUN_COUNT"]]
     reread_prob = [x > 1 for x in df["WORD_FIXATION_COUNT"]]
     # Handle the case where we don't fill NaN values
     tot_regr_from_dur = []
     for x, y in zip(df["WORD_GO_PAST_TIME"],
                     df["WORD_SELECTIVE_GO_PAST_TIME"]):
         if pd.isna(x) or pd.isna(y):
             tot_regr_from_dur.append(np.nan)
         else:
             tot_regr_from_dur.append(max(x - y, 0))
     # 2050 tokens per participant do not have POS info.
     # We use a special UNK token for missing pos tags.
     pos = [
         GECO_POS_MAP[x] if not pd.isnull(x) else GECO_POS_MAP["UNK"]
         for x in df["PART_OF_SPEECH"]
     ]
     fix_prob = [1 - x for x in df["WORD_SKIP"]]
     # Format taken from Hollenstein et al. 2019 "NER at First Sight"
      out = pd.DataFrame({
          # Identifiers
          "participant": df["PP_NR"],
          "text_id": text_id,  # PART-TRIAL for GECO
          "sentence_id": df["SENTENCE_ID"],  # Absolute sentence position for GECO
          # AOI-level measures
          "word_id": df["WORD_ID"],
          "word": df["WORD"],
          "length": length,
          "pos": pos,
          # Basic measures
          "fix_count": df["WORD_FIXATION_COUNT"],
          "fix_prob": fix_prob,
          "mean_fix_dur": mean_fix_dur,
          # Early measures
          "first_fix_dur": df["WORD_FIRST_FIXATION_DURATION"],
          "first_pass_dur": df["WORD_GAZE_DURATION"],
          # Late measures
          "tot_fix_dur": df["WORD_TOTAL_READING_TIME"],
          "refix_count": refix_count,
          "reread_prob": reread_prob,
          # Context measures
          "tot_regr_from_dur": tot_regr_from_dur,
          "n-2_fix_prob": ([0, 0] + fix_prob)[:len(df)],
          "n-1_fix_prob": ([0] + fix_prob)[:len(df)],
          "n+1_fix_prob": (fix_prob + [0])[1:],
          "n+2_fix_prob": (fix_prob + [0, 0])[2:],
          "n-2_fix_dur": ([0, 0] + list(df["WORD_TOTAL_READING_TIME"]))[:len(df)],
          "n-1_fix_dur": ([0] + list(df["WORD_TOTAL_READING_TIME"]))[:len(df)],
          "n+1_fix_dur": (list(df["WORD_TOTAL_READING_TIME"]) + [0])[1:],
          "n+2_fix_dur": (list(df["WORD_TOTAL_READING_TIME"]) + [0, 0])[2:],
      })
     # Convert to correct data types
     out = out.astype(self.out_types_word)
     # Caching preprocessed dataset for next Processor calls
     save_tsv(out, self.out_preprocessed)
     logger.info(f"GECO data were preprocessed and saved as"
                 f" {self.out_preprocessed} with shape {out.shape}")
     self.preprocessed_data = out
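The n-1/n-2/n+1/n+2 context columns above are built by padding and slicing plain Python lists; purely as an illustration, the same shift-with-zero-fill effect could be written with pandas as in the sketch below.

import pandas as pd

# Equivalent of the list-padding idiom used above, shown on a toy series:
# ([0] + fix_prob)[:len(df)]  ->  shift down by one, pad with 0 (the n-1 value).
# (fix_prob + [0, 0])[2:]     ->  shift up by two, pad with 0 (the n+2 value).
s = pd.Series([1.0, 0.0, 1.0, 1.0])
prev_1 = s.shift(1, fill_value=0)   # n-1 fixation probability per row
next_2 = s.shift(-2, fill_value=0)  # n+2 fixation probability per row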
Example #10
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--all",
                        action="store_true",
                        help="Shorthand to perform all analysis steps.")
    parser.add_argument(
        "--config_path",
        type=str,
        default=None,
        help="Path to the config json file used for linguistic analysis."
        "By default uses the DEFAULT_CONFIG specified in this file.",
    )
    parser.add_argument("--out_dir",
                        type=str,
                        default="logs/feature_analysis",
                        help="Directory in which results will be saved.")
    parser.add_argument(
        "--do_feat_corr_ranks",
        action="store_true",
        help="Compute correlation ranks between features and task scores.")
    parser.add_argument(
        "--do_feat_svr_ranks",
        action="store_true",
        help="Compute SVR coefficient ranks between features and task scores.",
    )
    parser.add_argument("--do_compare_corr_ranks", action="store_true")
    parser.add_argument("--do_rankings_correlation", action="store_true")
    parser.add_argument("--do_feat_corr_ranks_over_bins", action="store_true")
    parser.add_argument(
        "--start_bin",
        type=int,
        default=10,
        help=
        "The starting size bin for which feature correlation should be computed.",
    )
    parser.add_argument(
        "--end_bin",
        type=int,
        default=35,
        help=
        "The ending size bin for which feature correlation should be computed."
    )
    parser.add_argument(
        "--bin_step",
        type=int,
        default=5,
        help="The step size to be taken from start bin to end bin.")
    parser.add_argument(
        "--bin_width",
        type=int,
        default=1,
        help=
        "The +- interval in which scores are considered to be part of the same bin.",
    )
    parser.add_argument(
        "--overwrite_output_files",
        action="store_true",
        help=
        "Specifies that existing output files should be overwritten by new ones."
        "By default, results are appended to existing files.",
    )
    parser.add_argument(
        "--save_binned_data",
        action="store_true",
        help="If specified, saves the binned data in tsv format.")
    args = parser.parse_args()
    args.leave_nans = False
    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)
    if args.config_path is None:
        config = DEFAULT_CONFIG
    else:
        with open(args.config_path, "r") as c:
            config = json.load(c)
    args.write_mode = "w" if args.overwrite_output_files else "a+"
    corr_ranks = {}
    svr_ranks = {}
    if args.all:
        args.do_feat_svr_ranks = True
        args.do_feat_corr_ranks, args.do_compare_corr_ranks = True, True
        args.do_rankings_correlation, args.do_feat_corr_ranks_over_bins = True, True
    for data_name in config.keys():
        data = read_tsv(config[data_name]["path"])
        corr_ranks = {
            **corr_ranks,
            **(compute_corr_ranks(args, data, data_name, config[data_name]))
        }
        if args.do_feat_svr_ranks:
            svr_ranks = {
                **svr_ranks,
                **(compute_svr_ranks(args, data, data_name, config[data_name]))
            }
    for task_name in corr_ranks.keys():
        corr_ranks[task_name].sort(key=lambda tup: tup[1].correlation,
                                   reverse=True)
        if args.do_feat_svr_ranks:
            svr_ranks[task_name].sort(key=lambda tup: tup[1], reverse=True)
    if args.do_feat_corr_ranks:
        print_ranks(args, corr_ranks)
    if args.do_feat_svr_ranks:
        print_ranks(args, svr_ranks, rtype="svr")
    if args.do_compare_corr_ranks:
        if len(corr_ranks.keys()) < 2:
            raise AttributeError("At least two tasks should be specified to "
                                 "compare correlation ranks.")
        diff_corr = compare_corr_ranks(args, corr_ranks, TARGET_TASK)
        for task_name in diff_corr.keys():
            diff_corr[task_name].sort(key=lambda tup: abs(tup[1]),
                                      reverse=True)
        print_diff_corr_ranks(args, diff_corr, TARGET_TASK)
    if args.do_rankings_correlation:
        if len(corr_ranks.keys()) < 2:
            raise AttributeError("At least two tasks should be specified to "
                                 "compare correlation ranks.")
        if not args.do_compare_corr_ranks:
            raise AttributeError(
                "Correlation rank differences should be computed to correlate them."
            )
        rankings_correlation(args, diff_corr, TARGET_TASK)
    if args.do_feat_corr_ranks_over_bins:
        if args.start_bin is None or args.end_bin is None:
            raise AttributeError(
                "start_bin and end_bin argument should be specified "
                "for feature_corr_ranks_over_bins option.")
        ranks_per_bin = compute_corr_ranks_over_bins(args, config)
        print_corr_ranks_over_bins(args, ranks_per_bin)
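Like the baselines script, this entry point parses its own flags; a full analysis run could be simulated as follows (the script name and config path are assumptions).

import sys

# Hypothetical invocation of the full feature-analysis pipeline.
sys.argv = [
    "feature_analysis.py",
    "--all",
    "--out_dir", "logs/feature_analysis",
    "--config_path", "configs/feature_analysis.json",  # assumed config file
    "--save_binned_data",
]
main()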