Example #1
    def _run_epoch_dev_part(self, sess, dataset_dev):
        '''
        Runs the test ("dev") pass for one epoch.
        Returns the predicted answers.
        Could probably be merged with _run_epoch_val_part; kept separate for simplicity.
        '''
        si_pred = []
        ei_pred = []
        prog = util.Progbar(
                target=1 + int(len(dataset_dev['.ids.context']) / \
                               FLAGS.batch_size))
        for i, batch in enumerate(self.get_mini(dataset_dev,
                                                FLAGS.batch_size,
                                                shuffle=False,
                                                span=False)):

            si_all, ei_all, mem = self._run_epoch_dev_minibatch(sess, batch)
            mem = int(mem) >> 20  # bytes -> megabytes
            prog.update(i + 1, exact=[('mem', mem)])
            si_pred.append(si_all)  # order should be preserved. shuffle=False
            ei_pred.append(ei_all)

        si_pred = np.concatenate(si_pred)
        ei_pred = np.concatenate(ei_pred)
        span = self.pick_si_ei(si_pred, ei_pred)
        answers = self.span_to_pred(span, dataset_dev['.ids.context'])
        return answers
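
The span selection in pick_si_ei is not shown here. For orientation, a minimal sketch of one common choice, greedy argmax with the end index constrained to follow the start; the function name and the (batch, context_len) score shapes are assumptions, not part of the code above:

import numpy as np

def pick_span_greedy(si_scores, ei_scores):
    # si_scores, ei_scores: per-token start/end scores, one row per example (assumed)
    spans = []
    for si, ei in zip(si_scores, ei_scores):
        start = int(np.argmax(si))
        end = start + int(np.argmax(ei[start:]))  # force end >= start
        spans.append((start, end))
    return np.array(spans)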
Example #2
    def _run_epoch_val_part(self, sess, dataset_val):
        '''
        Validates for one epoch.
        Returns the validation metric: the harmonic mean of F1 and EM.
        '''
        si_pred = []
        ei_pred = []
        #losses = []
        prog = util.Progbar(
                target=1 + int(len(dataset_val['.ids.context']) / \
                               FLAGS.batch_size))
        for i, batch in enumerate(self.get_mini(dataset_val,
                                                FLAGS.batch_size,
                                                shuffle=False,
                                                span=True)):

            si_all, ei_all, loss = self._run_epoch_val_minibatch(sess, batch)
            prog.update(i + 1, exact=[('val loss', loss)])
            si_pred.append(si_all)  # order should be preserved. shuffle=False
            ei_pred.append(ei_all)
            #losses.append(loss)

        si_pred = np.concatenate(si_pred)
        ei_pred = np.concatenate(ei_pred)
        span = self.pick_si_ei(si_pred, ei_pred)
        answers = self.span_to_pred(span, dataset_val['.ids.context'])
        f1, em = self.evaluate(answers, dataset_val['.answer'])
        #f1f1em = 2/(1/f1+1/em)
        # clamped so a zero F1 or EM on a small dataset does not divide by zero
        f1f1em = 2 / (1 / max(f1, 1e-10) + 1 / max(em, 1e-10))
        logging.info('last val loss {}'.format(loss))
        logging.info("F1: {}, EM: {}, F1F1EM: {}".format(f1, em, f1f1em))
        return f1f1em
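
For reference, the F1F1EM metric above as a standalone helper, with the same epsilon guard so a zero F1 or EM on a small dataset does not divide by zero:

def f1f1em(f1, em, eps=1e-10):
    # harmonic mean of F1 and EM, with both scores clamped away from zero
    return 2.0 / (1.0 / max(f1, eps) + 1.0 / max(em, eps))

# e.g. f1f1em(0.80, 0.60) is about 0.686, while f1f1em(0.80, 0.0) collapses to ~2e-10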
Example #3
    def _run_epoch_train_part(self, sess, dataset_train):
        '''
        trains for one epoch
        '''
        prog = util.Progbar(
                target=1 + int(len(dataset_train['.ids.context']) / \
                               FLAGS.batch_size))
        for i, batch in enumerate(self.get_mini(dataset_train,
                                                FLAGS.batch_size,
                                                shuffle=True,
                                                span=True)):
            out = self._run_epoch_train_minibatch(sess, batch)
            #_, loss, gnorm, mem, grad, var = out
            _, loss, gnorm, mem = out
            mem = int(mem) >> 20
            prog.update(i + 1, exact=[('train loss', loss),
                                      ('gnorm', gnorm),
                                      ('mem', mem)])  # mb
            if np.isnan(gnorm):
                logging.info('gnorm nan')
                #np.save('nan_grad', grad)
                #np.save('nan_var', var)
                raise Exception('gnorm nan')

        logging.info(
            'last train loss {}, gnorm {}, mem {}'.format(loss, gnorm, mem))
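
The gnorm checked for NaN above is presumably the global gradient norm over all trainable variables (e.g. what tf.global_norm computes); for reference, that quantity is equivalent to

gnorm = np.sqrt(sum(np.sum(np.square(g)) for g in grads))  # grads: list of gradient arrays (hypothetical)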
Example #4
def docs(dataset_name):
    '''Yield raw JSON documents from the given dataset file one at a time,
    updating a progress bar as they are read.'''
    p = util.Progbar(target=(util.lines_in_file(directories.RAW +
                                                dataset_name)))
    for i, d in enumerate(util.load_json_lines(directories.RAW +
                                               dataset_name)):
        p.update(i + 1)
        yield d
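
A hypothetical caller of this generator; the file name and process() below are placeholders, not part of the snippet above:

for d in docs("train.jsonl"):
    process(d)  # placeholder for the per-document work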
Example #5
    def __init__(self,
                 trainer,
                 docs,
                 data,
                 message,
                 replay_memory=None,
                 beta=0,
                 docs_per_iteration=10000):
        '''Run the agent over a shuffled (and, when training, subsampled) set of
        documents, tracking loss and coreference metrics for each one.'''
        self.trainer = trainer
        self.data = data
        self.model = trainer.model
        self.message = message
        self.replay_memory = replay_memory
        self.beta = beta
        self.loss_aggregator = Aggregator()
        self.evaluators = [
            evaluation.Evaluator(metric=evaluation.muc),
            evaluation.Evaluator(metric=evaluation.b_cubed),
            evaluation.Evaluator(metric=evaluation.ceafe)
        ]
        self.merged_pairs = {}
        self.training = self.replay_memory is not None

        print self.message
        random.shuffle(docs)
        if self.training:
            docs = docs[:docs_per_iteration]
        prog = util.Progbar(len(docs))
        for i, (doc, actionstate) in enumerate(docs):
            self.trainer.doc = doc
            self.trainer.actionstate = actionstate

            if len(actionstate.possible_pairs) != 0:
                actionstate.load(self.data, self.trainer.pair_model,
                                 self.trainer.anaphoricity_model)
                s = State(doc, actionstate)
                doc_merged_pairs = self.run_agent(s, beta, i)
                for evaluator in self.evaluators:
                    evaluator.update(doc)
                self.merged_pairs[doc.did] = doc_merged_pairs
                doc.reset()
                actionstate.clear()

            muc, b3, ceafe = (self.evaluators[j].get_f1() for j in range(3))
            exact = [('muc', 100 * muc), ('b3', 100 * b3),
                     ('ceafe', 100 * ceafe),
                     ('conll', 100 * (muc + b3 + ceafe) / 3),
                     ('loss', self.loss_aggregator.get_avg())]
            prog.update(i + 1, exact=exact)
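
The "conll" number in the progress bar above is just the unweighted mean of the three coreference F1 scores, scaled to a percentage:

def conll_f1(muc, b3, ceafe):
    # average of MUC, B-cubed and CEAFe F1, matching the readout above
    return 100 * (muc + b3 + ceafe) / 3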
Example #6
    def train_all(self):
        '''Train on every example in the replay memory once, then clear the
        memory and report how much the model weights changed.'''
        timer.start("train")

        model_weights = self.model.get_weights()
        prog = util.Progbar(len(self.memory))
        random.shuffle(self.memory)
        for i, X in enumerate(self.memory):
            loss = self.train_on_example(X)
            prog.update(i + 1, [("loss", loss)])
        self.size = 0
        self.memory = []
        timer.stop("train")
        weight_diffs = [(np.sum(np.abs(new_weight - old_weight)),
                         new_weight.size) for new_weight, old_weight in zip(
                             self.model.get_weights(), model_weights)]
        summed = np.sum(map(np.array, weight_diffs), axis=0)
        print "weight diffs", weight_diffs, summed
Example #7
def build_dataset(vectors,
                  name,
                  tune_fraction=0.0,
                  reduced=False,
                  columns=None):
    doc_vectors = util.load_pickle(directories.MISC +
                                   name.replace("_reduced", "") +
                                   "_document_vectors.pkl")

    main_pairs = PairDataBuilder(columns)
    tune_pairs = PairDataBuilder(columns)
    main_mentions = MentionDataBuilder(columns)
    tune_mentions = MentionDataBuilder(columns)
    main_docs = DocumentDataBuilder(columns)
    tune_docs = DocumentDataBuilder(columns)

    print "Building dataset", name
    p = util.Progbar(
        target=(2 if reduced else util.lines_in_file(directories.RAW + name)))
    for i, d in enumerate(util.load_json_lines(directories.RAW + name)):
        if reduced and i > 2:
            break
        p.update(i + 1)

        if reduced and tune_fraction != 0:
            pairs, mentions, docs = ((main_pairs, main_mentions, main_docs)
                                     if i == 0 else
                                     (tune_pairs, tune_mentions, tune_docs))
        else:
            pairs, mentions, docs = ((main_pairs, main_mentions, main_docs)
                                     if random.random() > tune_fraction else
                                     (tune_pairs, tune_mentions, tune_docs))

        ms, ps = mentions.size(), pairs.size()
        mention_positions = {}
        for mention_num in sorted(d["mentions"].keys(), key=int):
            mention_positions[mention_num] = mentions.size()
            mentions.add_mention(
                d["mentions"][mention_num], vectors,
                doc_vectors[d["mentions"][mention_num]["doc_id"]])

        for key in sorted(d["labels"].keys(),
                          key=lambda k:
                          (int(k.split()[1]), int(k.split()[0]))):
            k1, k2 = key.split()
            pairs.add_pair(d["labels"][key], mention_positions[k1],
                           mention_positions[k2],
                           int(d["mentions"][k1]["doc_id"]),
                           int(d["mentions"][k1]["mention_id"]),
                           int(d["mentions"][k2]["mention_id"]),
                           d["pair_features"][key])

        me, pe = mentions.size(), pairs.size()
        docs.add_doc(ms, me, ps, pe, d["document_features"])

    suffix = ("_reduced" if reduced else "")
    if tune_mentions.size() > 0:
        tune_mentions.write(name + "_tune" + suffix)
        tune_pairs.write(name + "_tune" + suffix)
        tune_docs.write(name + "_tune" + suffix)
        main_mentions.write(name + "_train" + suffix)
        main_pairs.write(name + "_train" + suffix)
        main_docs.write(name + "_train" + suffix)
    else:
        main_mentions.write(name + suffix)
        main_pairs.write(name + suffix)
        main_docs.write(name + suffix)
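
A hypothetical invocation, assuming the word vectors are already loaded and a 15% tuning split is wanted; the dataset name and fraction are illustrative only:

build_dataset(vectors, "train", tune_fraction=0.15, columns=None)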