# Example #1
def load_bias(dataset_name: str, is_train=True) -> Dict[str, np.ndarray]:
    """Loads the output of our bias-only model for TriviaQA-CP.

    Note that since this produces per-token output, it is only valid on data
    with the same tokenization as our annotated data.

    Args:
      dataset_name: Either "location" or "person".
      is_train: If True load the train split, otherwise the dev split.

    Returns:
      Dictionary of per-token bias arrays, as stored in the cached pickle.

    Raises:
      ValueError: If `dataset_name` is not a known dataset.
      RuntimeError: If no download id is registered for the requested split.
    """
    if dataset_name == "location":
        cache_dir = TRIVIAQA_CP_LOCATION_FILTERED_BIAS
    elif dataset_name == "person":
        cache_dir = TRIVIAQA_CP_PERSON_FILTERED_BIAS
    else:
        # Include context in the message, consistent with the other loaders
        raise ValueError("Invalid dataset %s" % dataset_name)

    part_name = "train" if is_train else "dev"
    src = join(cache_dir, "%s.pkl" % part_name)

    if not exists(src):
        key = (dataset_name, part_name)
        if key not in TRIVIAQA_CP_BIAS_FILE_IDS:
            # Was a bare RuntimeError() with no message; name the missing key
            # so the failure is actionable
            raise RuntimeError("No bias file id for %s/%s" % key)
        logging.info("Downloading TriviaQA-CP bias for %s to %s" %
                     (dataset_name, src))
        py_utils.download_from_drive(TRIVIAQA_CP_BIAS_FILE_IDS[key],
                                     src,
                                     progress_bar=False)

    return py_utils.load_pickle(src)
# Example #2
def load_triviaqa_cp(dataset_name: str, part: str) -> List[Dict]:
    """Load the official TriviaQA-CP dataset, needed for evaluation"""
    # Non-train parts are read from the "dev" source file
    if part == "train":
        src_name = "train"
    else:
        src_name = "dev"
    src = join(config.TRIVIAQA_CP_SOURCE, src_name + ".json")

    if not exists(src):
        logging.info("Download TriviaQA-CP %s to %s" % (src_name, src))
        file_id = TRIVIAQA_CP_FILE_IDS[src_name]
        py_utils.download_from_drive(file_id, src, True)

    return triviaqa_cp_loader.load_triviaqa_cp(src, dataset_name, part)
# Example #3
def load_annotated_squad(dataset_name) -> List[AnnotatedSquadParagraph]:
    """Loads SQuAD data that has been tokenized and tagged by CoreNLP"""
    # Reject unknown dataset names up front
    if dataset_name not in DATASETS:
        raise ValueError("Invalid dataset %s" % dataset_name)

    pkl_path = join(config.SQUAD_CORENLP, "%s.pkl" % dataset_name)
    if not exists(pkl_path):
        # Fetch the pre-processed pickle on first use
        logging.info("Download pre-processed SQuAD %s to %s" %
                     (dataset_name, pkl_path))
        drive_id = ANNOTATED_SQUAD_FILE_IDS[dataset_name]
        py_utils.download_from_drive(drive_id, pkl_path)

    return py_utils.load_pickle(pkl_path)
# Example #4
def load_annotated_triviaqa(is_train: bool) -> List[AnnotatedTriviaQaExample]:
    """Loads TriviaQA data that has been tokenized and tagged by CoreNLP"""
    split = "train" if is_train else "dev"
    pkl_path = join(config.TRIVIAQA_CP_CORENLP, "%s.pkl" % split)

    if not exists(pkl_path):
        # Fetch the pre-processed pickle on first use
        logging.info("Download pre-processed TriviaQA %s to %s" %
                     (split, pkl_path))
        drive_id = TRIVIAQA_CP_CORENLP_FILE_IDS[split]
        py_utils.download_from_drive(drive_id, pkl_path, progress_bar=True)

    logging.info("Loading CoreNLP TriviaQA %s..." % split)
    return py_utils.load_pickle(pkl_path)
# Example #5
def load_bias(dataset_name, filtered=False) -> Dict[str, np.ndarray]:
    """Loads the output of our bias-only model

  Note that since this produces per-token output, it is only valid on data with the
  same tokenization as our annotated data.
  """
    # Pick the id table and cache directory for the requested variant
    if filtered:
        bias_ids, output_dir = SQUAD_FILTERED_BIAS_FILE_IDS, SQUAD_TFIDF_FILTERED_BIAS
    else:
        bias_ids, output_dir = SQUAD_BIAS_FILE_IDS, SQUAD_TFIDF_BIAS

    if dataset_name not in bias_ids:
        raise ValueError("No bias for %s" % dataset_name)

    pkl_path = join(output_dir, "%s.pkl" % dataset_name)
    if not exists(pkl_path):
        # Fetch the cached bias predictions on first use
        logging.info("Downloading SQuAD bias for %s to %s" %
                     (dataset_name, pkl_path))
        py_utils.download_from_drive(bias_ids[dataset_name], pkl_path)

    return py_utils.load_pickle(pkl_path)
# Example #6
def load_bias(dataset_name) -> Dict[str, np.ndarray]:
  """Load dictionary of example_id->bias where bias is a length 3 array
  of log-probabilities"""
  if dataset_name not in MNLI_BIAS_DRIVE_IDS:
    raise ValueError(dataset_name)

  bias_src = join(config.MNLI_WORD_OVERLAP_BIAS, dataset_name + ".pkl")
  if not exists(bias_src):
    logging.info("Downloading MNLI bias to %s..." % bias_src)
    py_utils.download_from_drive(MNLI_BIAS_DRIVE_IDS[dataset_name], bias_src)

  two_class = py_utils.load_pickle(bias_src)

  # Convert from entail vs non-entail to 3-way classes by splitting non-entail
  # to neutral and contradict: each gets the non-entail mass minus log(2)
  half = np.log(2.)
  return {
    ex_id: np.array([v[0] - half, v[1], v[0] - half])
    for ex_id, v in two_class.items()
  }