Example #1
def load_bias(dataset_name: str, is_train=True) -> Dict[str, np.ndarray]:
    """Loads the output of our bias-only model

  Note that since this produces per-token output, it is only valid on data with the
  same tokenization as our annotated data.
  """
    if dataset_name == "location":
        cache_dir = TRIVIAQA_CP_LOCATION_FILTERED_BIAS
    elif dataset_name == "person":
        cache_dir = TRIVIAQA_CP_PERSON_FILTERED_BIAS
    else:
        raise ValueError(dataset_name)

    part_name = "train" if is_train else "dev"
    src = join(cache_dir, "%s.pkl" % part_name)

    if not exists(src):
        key = (dataset_name, part_name)
        if key not in TRIVIAQA_CP_BIAS_FILE_IDS:
            raise RuntimeError("No cached bias file and no known download id for %s/%s" % key)
        logging.info("Downloading TriviaQA-CP bias for %s to %s" %
                     (dataset_name, src))
        py_utils.download_from_drive(TRIVIAQA_CP_BIAS_FILE_IDS[key],
                                     src,
                                     progress_bar=False)

    return py_utils.load_pickle(src)
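A minimal usage sketch (assuming the debias package and its config constants are importable and the Drive download is reachable; treating the returned dict keys as example ids is an assumption here):

# Per-token bias scores for the TriviaQA-CP "location" dev split; the first call downloads the pickle.
bias = load_bias("location", is_train=False)
first_key = next(iter(bias))              # presumably an example/question id
print(first_key, bias[first_key].shape)   # per-token array of bias scores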
Example #2
File: squad.py Project: kiminh/debias
def load_annotated_squad(dataset_name) -> List[AnnotatedSquadParagraph]:
    """Loads SQuAD data that has been tokenized and tagged by CoreNLP"""

    if dataset_name not in DATASETS:
        raise ValueError("Invalid dataset %s" % dataset_name)
    src = join(config.SQUAD_CORENLP, "%s.pkl" % dataset_name)
    if not exists(src):
        logging.info("Download pre-processed SQuAD %s to %s" %
                     (dataset_name, src))
        py_utils.download_from_drive(ANNOTATED_SQUAD_FILE_IDS[dataset_name],
                                     src)
    return py_utils.load_pickle(src)
Example #3
def load_annotated_triviaqa(is_train: bool) -> List[AnnotatedTriviaQaExample]:
    """Loads TriviaQA data that has been tokenized and tagged by CoreNLP"""
    dataset_name = "train" if is_train else "dev"
    src = join(config.TRIVIAQA_CP_CORENLP, "%s.pkl" % dataset_name)
    if not exists(src):
        logging.info("Download pre-processed TriviaQA %s to %s" %
                     (dataset_name, src))
        py_utils.download_from_drive(
            TRIVIAQA_CP_CORENLP_FILE_IDS[dataset_name], src, progress_bar=True)

    logging.info("Loading CoreNLP TriviaQA %s..." % dataset_name)
    return py_utils.load_pickle(src)
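For instance, a hedged sketch (the first call fetches the annotated pickle from Drive, so network access is assumed):

# Load the CoreNLP-annotated TriviaQA-CP dev examples.
dev_examples = load_annotated_triviaqa(is_train=False)
print(len(dev_examples), type(dev_examples[0]).__name__)   # -> AnnotatedTriviaQaExample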
Example #4
def get_squad_tfidf_features(dataset_name, pos_filter):
    """Gets the tfidf features we used to train the bias-only model

  :param dataset_name: Name of SQuAD dataset to get features for
  :param pos_filter: POS filtered TF-IDF scores or not
  :return: Dictionary of question_id -> per-word array of TT-IDF scores
  """
    if pos_filter:
        root = SQUAD_FILTERED_TFIDF_FEATURES
    else:
        root = SQUAD_TFIDF_FEATURES
    src = join(root, dataset_name + ".pkl")
    if not exists(src):
        build_squad_tfidf_features(pos_filter)
    return py_utils.load_pickle(src)
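A small usage sketch; the split name "dev" is only a hypothetical example and must correspond to a feature pickle that exists or that build_squad_tfidf_features can produce:

# POS-filtered TF-IDF features, keyed by question_id.
features = get_squad_tfidf_features("dev", pos_filter=True)   # "dev" is a hypothetical split name
qid, scores = next(iter(features.items()))
print(qid, scores[:5])   # per-word TF-IDF scores for the first question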
Example #5
File: squad.py Project: kiminh/debias
def load_bias(dataset_name, filtered=False) -> Dict[str, np.ndarray]:
    """Loads the output of our bias-only model

  Note that since this produces per-token output, it is only valid on data with the
  same tokenization as our annotated data.
  """
    if filtered:
        bias_ids = SQUAD_FILTERED_BIAS_FILE_IDS
        output_dir = SQUAD_TFIDF_FILTERED_BIAS
    else:
        bias_ids = SQUAD_BIAS_FILE_IDS
        output_dir = SQUAD_TFIDF_BIAS

    if dataset_name not in bias_ids:
        raise ValueError("No bias for %s" % dataset_name)
    src = join(output_dir, "%s.pkl" % dataset_name)
    if not exists(src):
        logging.info("Downloading SQuAD bias for %s to %s" %
                     (dataset_name, src))
        py_utils.download_from_drive(bias_ids[dataset_name], src)
    return py_utils.load_pickle(src)
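Usage mirrors the TriviaQA-CP loader in Example #1; a hedged sketch where "dev" is only assumed to be a key of SQUAD_FILTERED_BIAS_FILE_IDS:

# Per-token bias from the POS-filtered TF-IDF bias-only model.
squad_bias = load_bias("dev", filtered=True)   # "dev" is a hypothetical dataset name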
Example #6
File: train_bert.py Project: kiminh/debias
def load_bias(dataset_name) -> Dict[str, np.ndarray]:
  """Load dictionary of example_id->bias where bias is a length 3 array
  of log-probabilities"""

  if dataset_name not in MNLI_BIAS_DRIVE_IDS:
    raise ValueError(dataset_name)
  bias_src = join(config.MNLI_WORD_OVERLAP_BIAS, dataset_name + ".pkl")
  if not exists(bias_src):
    logging.info("Downloading MNLI bias to %s..." % bias_src)
    py_utils.download_from_drive(MNLI_BIAS_DRIVE_IDS[dataset_name], bias_src)

  bias = py_utils.load_pickle(bias_src)
  for k, v in bias.items():
    # Convert the 2-way (non-entail, entail) output to 3-way classes by splitting the
    # non-entail probability mass evenly between neutral and contradict (hence -log(2))
    bias[k] = np.array([
      v[0] - np.log(2.),
      v[1],
      v[0] - np.log(2.),
    ])
  return bias
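The two-way to three-way conversion above can be checked with a small worked example (the probabilities are made up, not project data):

import numpy as np

# Suppose the bias-only model outputs log-probabilities for [non-entail, entail]:
v = np.log([0.6, 0.4])
three_way = np.array([v[0] - np.log(2.), v[1], v[0] - np.log(2.)])
# Each non-entail class receives half of the non-entail probability mass:
print(np.exp(three_way))          # approx. [0.3, 0.4, 0.3]
print(np.exp(three_way).sum())    # approx. 1.0, still a valid distribution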
Example #7
def get_predictions(path, part="hans", batch_size=128, sample=None, n_processes=None, cache=False):
  output = join(path, "%s-prediction.pkl" % part)
  if sample is None and exists(output) and cache:
    return py_utils.load_pickle(output)

  logging.info("Computing predictions for %s on %s..." % (path, part))
  logging.info("Loading model...")
  model_dir = ModelDir(path)
  model = model_dir.get_model()
  tokenizer = model.get_tokenizer()
  all_examples = load_eval_set(part, sample)
  all_examples = mnli.tokenize_examples(all_examples, tokenizer, n_processes)

  all_examples.sort(key=lambda x: len(x.premise))

  logging.info("Setup data...")
  voc = set()
  for ex in all_examples:
    voc.update(ex.premise)
    voc.update(ex.hypothesis)

  model.set_vocab(voc)

  with tf.Session(graph=tf.Graph()) as sess:
    ds = mnli.make_dataset(all_examples, shuffle=False)
    fn = model.tensorize_fn()

    ds = ds.map(fn)
    ds = ds.padded_batch(batch_size, ds.output_shapes)
    ds = ds.prefetch(5)  # prefetch returns a new dataset; keep the result
    it = ds.make_initializable_iterator()

    next_op = it.get_next()
    logits = model.apply(False, next_op, None)
    pred_op = tf.nn.softmax(logits)

    logging.info("Initializing...")
    sess.run(tf.local_variables_initializer())
    sess.run(tf.tables_initializer())
    sess.run(it.initializer)

    logging.info("Loading checkpoint...")
    saver = tf.train.Saver()
    saver.restore(sess, model_dir.get_latest_checkpoint())

    predictions = []
    pbar = tqdm(desc="classify", total=len(all_examples), ncols=80)
    while True:
      try:
        predictions.append(sess.run(pred_op))
        pbar.update(len(predictions[-1]))
      except tf.errors.OutOfRangeError:
        break
    pbar.close()

  predictions = np.concatenate(predictions, 0)
  predictions = {k.id: p for p, k in zip(predictions, all_examples)}
  if sample is None and cache:
    with open(output, "wb") as f:
      pickle.dump(predictions, f)
  return predictions
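A hedged call sketch; the model directory path is a placeholder, and its contents (checkpoint, tokenizer, model.pkl) are assumed to follow the layout expected by ModelDir:

# Evaluate a trained model on HANS; with cache=True the result is reused from hans-prediction.pkl.
preds = get_predictions("/path/to/model_dir", part="hans", batch_size=128, cache=True)
print(len(preds))   # dict: example id -> softmax scores over the three classes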
Example #8
def get_model(self) -> TextModel:
    return load_pickle(join(self.dir, "model.pkl"))