Example #1
def create_batches(df, ids_corpus, data_type, batch_size, padding_id):

    df = df[df['type'] == data_type]
    data_ids = df['id'].values

    # NO SHUFFLING FOR EVALUATION TO PRINT IN THE SAME ORDER

    N = len(data_ids)

    cnt = 0
    titles, bodies, tag_labels = [], [], []
    batches = []
    ids = []

    for u in xrange(N):
        q_id = data_ids[u]
        title, body, tag = ids_corpus[str(q_id)]  # tag is boolean vector
        cnt += 1
        titles.append(title)
        bodies.append(body)
        tag_labels.append(tag)
        ids.append(q_id)

        if cnt == batch_size or u == N - 1:
            titles, bodies, tag_labels = myio.create_one_batch(
                titles, bodies, tag_labels, padding_id)
            batches.append((ids, titles, bodies, tag_labels))

            titles, bodies, tag_labels = [], [], []
            cnt = 0
            ids = []

    return batches
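A minimal usage sketch for create_batches above, assuming the surrounding project's myio module is importable, a pandas DataFrame with 'id' and 'type' columns, and an ids_corpus dict keyed by the string question id; the concrete ids, the split name "dev", and the toy vectors are made up for illustration:

import numpy as np
import pandas as pd

df = pd.DataFrame({"id": [101, 102, 103], "type": ["dev", "dev", "dev"]})
ids_corpus = {
    # question id -> (title token ids, body token ids, boolean tag vector)
    "101": (np.array([1, 2]), np.array([3, 4, 5]), np.array([1, 0, 0])),
    "102": (np.array([6]), np.array([7, 8]), np.array([0, 1, 0])),
    "103": (np.array([9, 2]), np.array([4]), np.array([0, 0, 1])),
}

batches = create_batches(df, ids_corpus, "dev", batch_size=2, padding_id=0)
# each element of `batches` is (ids, padded titles, padded bodies, tag labels)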
Example #2
    def rank(self, query):
        model = self.model
        emb = model.embedding_layer
        args = model.args
        padding_id = model.padding_id
        score_func = self.score_func

        if isinstance(query, str) or isinstance(query, unicode):
            query = json.loads(query)

        p = query["query"].strip().split()
        lst_questions = [emb.map_to_ids(p, filter_oov=True)]
        for q in query["candidates"]:
            q = q.strip().split()
            lst_questions.append(emb.map_to_ids(q, filter_oov=True))
        batch, _ = myio.create_one_batch(lst_questions, lst_questions,
                                         padding_id, not args.average)

        scores = score_func(batch, batch)
        scores = [x for x in scores]
        assert len(scores) == len(batch) - 1
        if ("BM25" in query) and ("ratio" in query):
            BM25 = query["BM25"]
            ratio = query["ratio"]
            assert len(BM25) == len(scores)
            assert ratio >= 0 and ratio <= 1.0
            scores = [
                x * (1 - ratio) + y * ratio for x, y in zip(scores, BM25)
            ]

        ranks = sorted(range(len(scores)), key=lambda i: -scores[i])
        return {"ranks": ranks, "scores": scores}
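For reference, a sketch of the JSON payload that rank() above accepts, with the field names taken from the code; the texts and scores are invented, and `ranker` stands for whatever object exposes this method:

import json

payload = json.dumps({
    "query": "how to resize a jpeg image",
    "candidates": [
        "resize an image in python",
        "convert jpeg to png",
    ],
    # optional: interpolate the model scores with precomputed BM25 scores
    "BM25": [12.3, 4.5],   # one score per candidate
    "ratio": 0.5,          # interpolation weight, must lie in [0, 1]
})
# result = ranker.rank(payload)   # -> {"ranks": [...], "scores": [...]}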
Example #3
File: api.py  Project: Sundayxr/rcnn
    def rank(self, query):
        model = self.model
        emb = model.embedding_layer
        args = model.args
        padding_id = model.padding_id
        score_func = self.score_func

        if isinstance(query, str) or isinstance(query, unicode):
            query = json.loads(query)

        p = query["query"].strip().split()
        lst_questions = [ emb.map_to_ids(p, filter_oov=True) ]
        for q in query["candidates"]:
            q = q.strip().split()
            lst_questions.append(
                    emb.map_to_ids(q, filter_oov=True)
                )
        batch, _ = myio.create_one_batch(lst_questions,
                    lst_questions,
                    padding_id,
                    not args.average
                )

        scores = score_func(batch, batch)
        scores = [ x for x in scores ]
        assert len(scores) == len(batch)-1
        if ("BM25" in query) and ("ratio" in query):
            BM25 = query["BM25"]
            ratio = query["ratio"]
            assert len(BM25) == len(scores)
            assert ratio >= 0 and ratio <= 1.0
            scores = [ x*(1-ratio)+y*ratio for x,y in zip(scores, BM25) ]

        ranks = sorted(range(len(scores)), key=lambda i: -scores[i])
        return { "ranks": ranks, "scores": scores }
Example #4
def create_eval_batches(ids_corpus, data, padding_id, N_neg=20):
    lst = []

    def transform(counter, x, length):
        # map tag index `x` of the (1-based) `counter`-th question onto a single
        # flattened index, given `length` tags per question
        return ((counter - 1) * length) + x

    for pid, qids, qlabels in data:
        titles = []
        bodies = []
        tag_labels = []
        cnt_q = 0
        tuples = []
        for id in [pid]+qids:
            cnt_q += 1
            title, body, tag = ids_corpus[str(id)]
            titles.append(title)
            bodies.append(body)
            tag_labels.append(tag)

            q_positive_ids = [transform(cnt_q, idx, tag.shape[0]) for idx, label in enumerate(tag) if label == 1]
            q_negative_ids = [transform(cnt_q, idx, tag.shape[0]) for idx, label in enumerate(tag) if label == 0]
            np.random.shuffle(q_negative_ids)
            q_negative_ids = q_negative_ids[:N_neg]  # keep at most N_neg negatives (20 by default)
            tuples += [[p_id] + q_negative_ids for p_id in q_positive_ids]

        tuples = myio.create_hinge_batch(tuples)
        titles, bodies, tag_labels = myio.create_one_batch(titles, bodies, tag_labels, padding_id)
        lst.append((titles, bodies, np.array(qlabels, dtype="int32"), tag_labels, tuples, pid, qids))

    return lst
Example #5
def main(args):
    raw_corpus = myio.read_corpus(args.corpus)
    embedding_layer = myio.create_embedding_layer(
                raw_corpus,
                n_d = args.hidden_dim,
                cut_off = args.cut_off,
                embs = load_embedding_iterator(args.embeddings) if args.embeddings else None
            )
    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer)
    say("vocab size={}, corpus size={}\n".format(
            embedding_layer.n_V,
            len(raw_corpus)
        ))
    padding_id = embedding_layer.vocab_map["<padding>"]
    bos_id = embedding_layer.vocab_map["<s>"]
    eos_id = embedding_layer.vocab_map["</s>"]

    if args.reweight:
        weights = myio.create_idf_weights(args.corpus, embedding_layer)

    if args.dev:
        dev = myio.read_annotations(args.dev, K_neg=20, prune_pos_cnt=-1)
        dev = myio.create_eval_batches(ids_corpus, dev, padding_id)
    if args.test:
        test = myio.read_annotations(args.test, K_neg=20, prune_pos_cnt=-1)
        test = myio.create_eval_batches(ids_corpus, test, padding_id)

    if args.heldout:
        with open(args.heldout) as fin:
            heldout_ids = fin.read().split()
        heldout_corpus = dict((id, ids_corpus[id]) for id in heldout_ids if id in ids_corpus)
        train_corpus = dict((id, ids_corpus[id]) for id in ids_corpus
                                                if id not in heldout_corpus)
        heldout = myio.create_batches(heldout_corpus, [ ], args.batch_size,
                    padding_id, bos_id, eos_id, auto_encode=True)
        heldout = [ myio.create_one_batch(b1, t2, padding_id) for t1, b1, t2 in heldout ]
        say("heldout examples={}\n".format(len(heldout_corpus)))

    if args.train:
        model = Model(args, embedding_layer,
                      weights=weights if args.reweight else None)

        start_time = time.time()
        train = myio.read_annotations(args.train)
        if not args.use_anno: train = [ ]
        train_batches = myio.create_batches(ids_corpus, train, args.batch_size,
                    model.padding_id, model.bos_id, model.eos_id, auto_encode=True)
        say("{} to create batches\n".format(time.time()-start_time))

        model.ready()
        model.train(
                ids_corpus if not args.heldout else train_corpus,
                train,
                dev if args.dev else None,
                test if args.test else None,
                heldout if args.heldout else None
            )
Example #6
def create_eval_batches(ids_corpus, data, padding_id, pad_left):
    lst = []
    for pid, qids, qlabels in data:
        titles = []
        bodies = []
        for id in [pid] + qids:
            t, b = ids_corpus[id]
            titles.append(t)
            bodies.append(b)
        titles, bodies = myio.create_one_batch(titles, bodies, padding_id,
                                               pad_left)
        lst.append((titles, bodies, np.array(qlabels,
                                             dtype="int32"), pid, qids))
    return lst
Example #7
    def evaluate(self, data, eval_func):
        res = []
        for t, b, labels in data:
            idts, idbs = myio.create_one_batch(t, b, self.padding_id)
            scores = eval_func(idts)
            #assert len(scores) == len(labels)
            ranks = (-scores).argsort()
            ranked_labels = labels[ranks]
            res.append(ranked_labels)
        e = Evaluation(res)
        MAP = e.MAP() * 100
        MRR = e.MRR() * 100
        P1 = e.Precision(1) * 100
        P5 = e.Precision(5) * 100
        return MAP, MRR, P1, P5
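To make the ranking step above concrete, a small self-contained numpy illustration of how (-scores).argsort() reorders the label vector (the numbers are made up):

import numpy as np

scores = np.array([0.2, 0.9, 0.5])   # one score per candidate
labels = np.array([0, 1, 0])         # 1 marks a relevant candidate
ranks = (-scores).argsort()          # indices by descending score -> array([1, 2, 0])
ranked_labels = labels[ranks]        # labels in ranked order -> array([1, 0, 0])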
Example #8
    def evaluate(self, data, eval_func):
        res = [ ]
        for t, b, labels in data:
            idts, idbs = myio.create_one_batch(t, b, self.padding_id)
            scores = eval_func(idts)
            #assert len(scores) == len(labels)
            ranks = (-scores).argsort()
            ranked_labels = labels[ranks]
            res.append(ranked_labels)
        e = Evaluation(res)
        MAP = e.MAP()*100
        MRR = e.MRR()*100
        P1 = e.Precision(1)*100
        P5 = e.Precision(5)*100
        return MAP, MRR, P1, P5
Example #9
def main(args):
    raw_corpus = myio.read_corpus(args.corpus)
    embedding_layer = myio.create_embedding_layer(
        raw_corpus,
        n_d=args.hidden_dim,
        cut_off=args.cut_off,
        embs=load_embedding_iterator(args.embeddings)
        if args.embeddings else None)
    ids_corpus = myio.map_corpus(raw_corpus, embedding_layer)
    say("vocab size={}, corpus size={}\n".format(embedding_layer.n_V,
                                                 len(raw_corpus)))
    padding_id = embedding_layer.vocab_map["<padding>"]
    bos_id = embedding_layer.vocab_map["<s>"]
    eos_id = embedding_layer.vocab_map["</s>"]

    if args.reweight:
        weights = myio.create_idf_weights(args.corpus, embedding_layer)

    if args.dev:
        dev = myio.read_annotations(args.dev, K_neg=20, prune_pos_cnt=-1)
        dev = myio.create_eval_batches(ids_corpus, dev, padding_id)
    if args.test:
        test = myio.read_annotations(args.test, K_neg=20, prune_pos_cnt=-1)
        test = myio.create_eval_batches(ids_corpus, test, padding_id)

    if args.heldout:
        with open(args.heldout) as fin:
            heldout_ids = fin.read().split()
        heldout_corpus = dict(
            (id, ids_corpus[id]) for id in heldout_ids if id in ids_corpus)
        train_corpus = dict((id, ids_corpus[id]) for id in ids_corpus
                            if id not in heldout_corpus)
        heldout = myio.create_batches(heldout_corpus, [],
                                      args.batch_size,
                                      padding_id,
                                      bos_id,
                                      eos_id,
                                      auto_encode=True)
        heldout = [
            myio.create_one_batch(b1, t2, padding_id) for t1, b1, t2 in heldout
        ]
        say("heldout examples={}\n".format(len(heldout_corpus)))

    if args.train:
        model = Model(args,
                      embedding_layer,
                      weights=weights if args.reweight else None)

        start_time = time.time()
        train = myio.read_annotations(args.train)
        if not args.use_anno: train = []
        train_batches = myio.create_batches(ids_corpus,
                                            train,
                                            args.batch_size,
                                            model.padding_id,
                                            model.bos_id,
                                            model.eos_id,
                                            auto_encode=True)
        say("{} to create batches\n".format(time.time() - start_time))
        model.ready()

        model.train(ids_corpus if not args.heldout else train_corpus, train,
                    dev if args.dev else None, test if args.test else None,
                    heldout if args.heldout else None)
Example #10
    def train(self, ids_corpus, train, dev=None, test=None, heldout=None):
        args = self.args
        dropout_prob = np.float64(args.dropout).astype(theano.config.floatX)
        batch_size = args.batch_size
        padding_id = self.padding_id
        bos_id = self.bos_id
        eos_id = self.eos_id

        #train_batches = myio.create_batches(ids_corpus, train, batch_size, padding_id, args.loss)

        updates, lr, gnorm = create_optimization_updates(
            cost=self.cost,
            params=self.params,
            lr=args.learning_rate,
            method=args.learning)[:3]

        train_func = theano.function(inputs=[self.idxs, self.idys],
                                     outputs=[self.cost, self.loss, gnorm],
                                     updates=updates)

        eval_func = theano.function(
            inputs=[self.idxs],
            #outputs = self.scores2
            outputs=self.scores)

        nll_func = theano.function(inputs=[self.idxs, self.idys],
                                   outputs=[self.nll, self.mask])

        say("\tp_norm: {}\n".format(self.get_pnorm_stat()))

        result_table = PrettyTable(
            ["Epoch", "dev MAP", "dev MRR", "dev P@1", "dev P@5"] +
            ["tst MAP", "tst MRR", "tst P@1", "tst P@5"])

        unchanged = 0
        best_dev = -1
        dev_MAP = dev_MRR = dev_P1 = dev_P5 = 0
        test_MAP = test_MRR = test_P1 = test_P5 = 0
        heldout_PPL = -1

        start_time = 0
        max_epoch = args.max_epoch
        for epoch in xrange(max_epoch):
            unchanged += 1
            if unchanged > 8: break

            start_time = time.time()

            train_batches = myio.create_batches(ids_corpus,
                                                train,
                                                batch_size,
                                                padding_id,
                                                bos_id,
                                                eos_id,
                                                auto_encode=True)
            N = len(train_batches)

            train_cost = 0.0
            train_loss = 0.0
            train_loss2 = 0.0
            for i in xrange(N):
                # get current batch
                t1, b1, t2 = train_batches[i]

                if args.use_title:
                    idxs, idys = myio.create_one_batch(t1, t2, padding_id)
                    cur_cost, cur_loss, grad_norm = train_func(idxs, idys)
                    train_cost += cur_cost
                    train_loss += cur_loss
                    train_loss2 += cur_loss / idys.shape[0]

                if args.use_body:
                    idxs, idys = myio.create_one_batch(b1, t2, padding_id)
                    cur_cost, cur_loss, grad_norm = train_func(idxs, idys)
                    train_cost += cur_cost
                    train_loss += cur_loss
                    train_loss2 += cur_loss / idys.shape[0]

                if i % 10 == 0:
                    say("\r{}/{}".format(i, N))

                if i == N - 1:
                    self.dropout.set_value(0.0)

                    if dev is not None:
                        dev_MAP, dev_MRR, dev_P1, dev_P5 = self.evaluate(
                            dev, eval_func)
                    if test is not None:
                        test_MAP, test_MRR, test_P1, test_P5 = self.evaluate(
                            test, eval_func)
                    if heldout is not None:
                        heldout_PPL = self.evaluate_perplexity(
                            heldout, nll_func)

                    if dev_MRR > best_dev:
                        unchanged = 0
                        best_dev = dev_MRR
                        result_table.add_row([epoch] + [
                            "%.2f" % x
                            for x in [dev_MAP, dev_MRR, dev_P1, dev_P5] +
                            [test_MAP, test_MRR, test_P1, test_P5]
                        ])
                        if args.model:
                            self.save_model(args.model + ".pkl.gz")

                    dropout_p = np.float64(args.dropout).astype(
                        theano.config.floatX)
                    self.dropout.set_value(dropout_p)

                    say("\r\n\n")
                    say( ( "Epoch {}\tcost={:.3f}\tloss={:.3f} {:.3f}\t" \
                        +"\tMRR={:.2f},{:.2f}\tPPL={:.1f}\t|g|={:.3f}\t[{:.3f}m]\n" ).format(
                            epoch,
                            train_cost / (i+1),
                            train_loss / (i+1),
                            train_loss2 / (i+1),
                            dev_MRR,
                            best_dev,
                            heldout_PPL,
                            float(grad_norm),
                            (time.time()-start_time)/60.0
                    ))
                    say("\tp_norm: {}\n".format(self.get_pnorm_stat()))

                    say("\n")
                    say("{}".format(result_table))
                    say("\n")
Example #11
    def train(self, ids_corpus, train, dev=None, test=None, heldout=None):
        args = self.args
        dropout_prob = np.float64(args.dropout).astype(theano.config.floatX)
        batch_size = args.batch_size
        padding_id = self.padding_id
        bos_id = self.bos_id
        eos_id = self.eos_id

        #train_batches = myio.create_batches(ids_corpus, train, batch_size, padding_id, args.loss)

        updates, lr, gnorm = create_optimization_updates(
                cost = self.cost,
                params = self.params,
                lr = args.learning_rate,
                method = args.learning
            )[:3]

        train_func = theano.function(
                inputs = [ self.idxs, self.idys ],
                outputs = [ self.cost, self.loss, gnorm ],
                updates = updates
            )

        eval_func = theano.function(
                inputs = [ self.idxs ],
                #outputs = self.scores2
                outputs = self.scores
            )

        nll_func = theano.function(
                inputs = [ self.idxs, self.idys ],
                outputs = [ self.nll, self.mask ]
            )

        say("\tp_norm: {}\n".format(
                self.get_pnorm_stat()
            ))

        result_table = PrettyTable(["Epoch", "dev MAP", "dev MRR", "dev P@1", "dev P@5"] +
                                    ["tst MAP", "tst MRR", "tst P@1", "tst P@5"])

        unchanged = 0
        best_dev = -1
        dev_MAP = dev_MRR = dev_P1 = dev_P5 = 0
        test_MAP = test_MRR = test_P1 = test_P5 = 0
        heldout_PPL = -1

        start_time = 0
        max_epoch = args.max_epoch
        for epoch in xrange(max_epoch):
            unchanged += 1
            if unchanged > 8: break

            start_time = time.time()

            train_batches = myio.create_batches(ids_corpus, train, batch_size,
                                    padding_id, bos_id, eos_id, auto_encode=True)
            N = len(train_batches)

            train_cost = 0.0
            train_loss = 0.0
            train_loss2 = 0.0
            for i in xrange(N):
                # get current batch
                t1, b1, t2 = train_batches[i]

                if args.use_title:
                    idxs, idys = myio.create_one_batch(t1, t2, padding_id)
                    cur_cost, cur_loss, grad_norm = train_func(idxs, idys)
                    train_cost += cur_cost
                    train_loss += cur_loss
                    train_loss2 += cur_loss / idys.shape[0]

                if args.use_body:
                    idxs, idys = myio.create_one_batch(b1, t2, padding_id)
                    cur_cost, cur_loss, grad_norm = train_func(idxs, idys)
                    train_cost += cur_cost
                    train_loss += cur_loss
                    train_loss2 += cur_loss / idys.shape[0]

                if i % 10 == 0:
                    say("\r{}/{}".format(i,N))

                if i == N-1:
                    self.dropout.set_value(0.0)

                    if dev is not None:
                        dev_MAP, dev_MRR, dev_P1, dev_P5 = self.evaluate(dev, eval_func)
                    if test is not None:
                        test_MAP, test_MRR, test_P1, test_P5 = self.evaluate(test, eval_func)
                    if heldout is not None:
                        heldout_PPL = self.evaluate_perplexity(heldout, nll_func)

                    if dev_MRR > best_dev:
                        unchanged = 0
                        best_dev = dev_MRR
                        result_table.add_row(
                            [ epoch ] +
                            [ "%.2f" % x for x in [ dev_MAP, dev_MRR, dev_P1, dev_P5 ] +
                                        [ test_MAP, test_MRR, test_P1, test_P5 ] ]
                        )
                        if args.model:
                            self.save_model(args.model+".pkl.gz")

                    dropout_p = np.float64(args.dropout).astype(
                                theano.config.floatX)
                    self.dropout.set_value(dropout_p)

                    say("\r\n\n")
                    say( ( "Epoch {}\tcost={:.3f}\tloss={:.3f} {:.3f}\t" \
                        +"\tMRR={:.2f},{:.2f}\tPPL={:.1f}\t|g|={:.3f}\t[{:.3f}m]\n" ).format(
                            epoch,
                            train_cost / (i+1),
                            train_loss / (i+1),
                            train_loss2 / (i+1),
                            dev_MRR,
                            best_dev,
                            heldout_PPL,
                            float(grad_norm),
                            (time.time()-start_time)/60.0
                    ))
                    say("\tp_norm: {}\n".format(
                            self.get_pnorm_stat()
                        ))

                    say("\n")
                    say("{}".format(result_table))
                    say("\n")