Example #1
import argparse
import pickle

# `load_word_vectors` is the function under demonstration; import it from its
# defining module in your project.


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("vecs")
    parser.add_argument("vocab")
    parser.add_argument("output")
    args = parser.parse_args()

    # Read the vocabulary, one word per line
    voc = set()
    with open(args.vocab) as f:
        for line in f:
            voc.add(line.strip())

    # Prune the vectors to the vocabulary and cache the result with pickle
    vecs = load_word_vectors(args.vecs, voc)
    with open(args.output, "wb") as f:
        pickle.dump(vecs, f)
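For context, a minimal sketch of reading back the cache this script writes; "vecs.pkl" is a placeholder for whatever was passed as output:

import pickle

# Hypothetical: reload the pruned vectors written by main() above
with open("vecs.pkl", "rb") as f:
    vecs = pickle.load(f)
print("loaded %d pruned word vectors" % len(vecs))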
Example #2
import pickle

import tensorflow as tf

# ParagraphAndQuestionSpec, ResourceLoader, RecordParagraphSpanPrediction,
# AysncEvaluatorRunner (sic, spelled this way upstream), and load_word_vectors
# come from the project's own modules and must be imported as well.


def build_model_and_evaluator_runner(model_config, max_answer_len,
                                     n_paragraphs):
    # Load the pickled model definition
    with open(model_config.model_pickle_file, 'rb') as f:
        model = pickle.load(f)

    # Re-point the pickled model at local copies of the LM and embedding resources
    model.lm_model.weight_file = model_config.lm_weights_file
    model.lm_model.lm_vocab_file = model_config.vocab_file
    model.lm_model.embed_weights_file = model_config.lm_token_weights_file
    model.lm_model.options_file = model_config.lm_options_file
    model.word_embed.vec_name = model_config.word_vector_file
    # Special LM tokens that should not be initialized with word vectors
    vocab_to_ignore = {'<S>', '</S>', '<UNK>', '!!!MAXTERMID'}

    vocab_to_init_with = {
        line.strip()
        for line in open(model_config.vocab_file, encoding="utf-8")
        if line.strip() not in vocab_to_ignore
    }

    sess = tf.Session()
    with sess.as_default():
        model.set_input_spec(ParagraphAndQuestionSpec(None, None, None, 14),
                             vocab_to_init_with,
                             word_vec_loader=ResourceLoader(
                                 load_vec_fn=lambda x, y: load_word_vectors(
                                     x, y, is_path=True)))
        evaluator_runner = AysncEvaluatorRunner(
            [RecordParagraphSpanPrediction(max_answer_len, True)], model, 10)

        # Feed each model placeholder from the evaluator queue's dequeue op
        input_dict = {
            p: x
            for p, x in zip(model.get_placeholders(),
                            evaluator_runner.dequeue_op)
        }
        pred = model.get_predictions_for(input_dict)
    evaluator_runner.set_input(pred)

    # Restore everything except the "bilm" (language model) variables from the
    # checkpoint; the bilm variables are initialized separately below.
    all_vars = tf.global_variables() + tf.get_collection(
        tf.GraphKeys.SAVEABLE_OBJECTS)
    lm_var_names = {x.name for x in all_vars if x.name.startswith("bilm")}
    vars_to_restore = [x for x in all_vars if x.name not in lm_var_names]
    saver = tf.train.Saver(vars_to_restore)
    saver.restore(sess, model_config.checkpoint_file)
    sess.run(
        tf.variables_initializer(
            [x for x in all_vars if x.name in lm_var_names]))

    return sess, model, evaluator_runner
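To make the example self-contained, a hedged sketch of driving this function; the config object below is a stand-in built from the attribute names the function actually reads, and every path is a placeholder:

from types import SimpleNamespace

# Hypothetical config; all paths are placeholders, not real files
model_config = SimpleNamespace(
    model_pickle_file="model.pkl",
    lm_weights_file="lm_weights.hdf5",
    lm_token_weights_file="lm_token_embed.hdf5",
    lm_options_file="lm_options.json",
    vocab_file="vocab.txt",
    word_vector_file="glove.840B.300d",
    checkpoint_file="checkpoints/best",
)
# The numeric arguments here are arbitrary illustration values
sess, model, evaluator_runner = build_model_and_evaluator_runner(
    model_config, max_answer_len=17, n_paragraphs=None)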
Example #3

    def _build_model(self):
        # Same set of special LM tokens excluded in Example #2
        vocab_to_ignore = {'<S>', '</S>', '<UNK>', '!!!MAXTERMID'}
        vocab_to_init_with = {
            line.strip()
            for line in open(self.config.vocab_file, encoding="utf-8")
            if line.strip() not in vocab_to_ignore
        }
        self.model.word_embed.vec_name = self.config.word_vector_file
        with self.sess.as_default():
            self.model.set_input_spec(
                ParagraphAndQuestionSpec(None, None, None, 14),
                vocab_to_init_with,
                word_vec_loader=ResourceLoader(
                    load_vec_fn=lambda x, y: load_word_vectors(
                        x, y, is_path=True)))
            pred = self.model.get_production_predictions_for(
                {x: x
                 for x in self.model.get_placeholders()})
        return pred.start_logits, pred.end_logits, self.model.context_rep
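For orientation, a sketch of how the returned tensors might be consumed in the same class; the method name, caching attribute, and feed construction are assumptions, not part of the original:

    # Hypothetical sibling method (assumes self._tensors was initialized to None)
    def _predict(self, feed_dict):
        if self._tensors is None:
            self._tensors = self._build_model()  # build the graph only once
        start_logits, end_logits, context_rep = self._tensors
        return self.sess.run([start_logits, end_logits], feed_dict=feed_dict)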
Example #4
    # Requires `from os.path import isfile, join` and `import pickle` at module level
    def get_pruned_word_vecs(self, word_vec_name, voc=None):
        """
        Loads word vectors that have been pruned to the case-insensitive vocab of this corpus.
        WARNING: this includes dev words

        This exists since loading word-vecs each time we startup can be a big pain, so
        we cache the pruned vecs on-disk as a .npy file we can re-load quickly.
        """

        # Cache file (written with pickle, despite the ".npy" suffix)
        vec_file = join(self.dir, word_vec_name + self.WORD_VEC_SUFFIX + ".npy")
        if isfile(vec_file):
            print("Loading word vec %s for %s from cache" % (word_vec_name, self.name))
            with open(vec_file, "rb") as f:
                return pickle.load(f)
        else:
            print("Building pruned word vec %s for %s" % (self.name, word_vec_name))
            if voc is None:
                voc = self.get_vocab()
            vecs = load_word_vectors(word_vec_name, voc)
            with open(vec_file, "wb") as f:
                pickle.dump(vecs, f)
            return vecs
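A brief usage sketch; `corpus` is assumed to be an instance of the class owning this method, and the vector-set name is a stand-in:

# Hypothetical call: the first run builds and caches, later runs hit the cache
vecs = corpus.get_pruned_word_vecs("glove.840B.300d")
print("pruned vocab size:", len(vecs))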
Example #5

    def load_word_vec(self, vec_name, voc=None):
        # The trailing True is `is_path` (see Example #2), so `vec_name` is
        # resolved as a file path under this corpus's directory
        return load_word_vectors(join(self.path, vec_name), voc, True)