Exemplo n.º 1
0
    def Predict(self, conll_path):
        """Parse each sentence of a CoNLL file and yield it with predictions.

        Sentences are read with ``read_conll(conllFP, False)``.  For each one
        a ``ParseForest`` is built and two adjacent roots are attached per
        step, always taking the highest-scoring (position, relation,
        direction) triple, until a single root remains.  The chosen head id
        and relation are stored on the child node as ``pred_parent_id`` and
        ``pred_relation``.

        This is a generator, so the file stays open while it is consumed.

        Args:
            conll_path: Path of the CoNLL file to parse.

        Yields:
            The sentence object produced by ``read_conll`` once its forest
            has been reduced to one root (presumably the forest nodes are the
            sentence's own entries -- confirm in ``ParseForest``).
        """
        with open(conll_path, 'r') as conllFP:
            for sentence in read_conll(conllFP, False):
                self.Init()
                forest = ParseForest(sentence)
                self.getWordEmbeddings(forest, False)

                # Every node starts with two LSTM states, each fed the node's
                # own vector.
                for node in forest.roots:
                    firstState = self.builders[0].initial_state().add_input(
                        node.vec)
                    secondState = self.builders[1].initial_state().add_input(
                        node.vec)
                    node.lstms = [firstState, secondState]

                while len(forest.roots) > 1:
                    self.__evaluate(forest, False)
                    roots = forest.roots

                    topScore = float("-inf")
                    headPos, depPos = None, None
                    pairPos, direction = None, None

                    # Candidate (pos, cand): cand == 0 makes roots[pos] the
                    # head of roots[pos + 1]; cand == 1 is the reverse.
                    for pos in xrange(len(forest.roots) - 1):
                        for relIdx, relName in enumerate(self.irels):
                            for cand in xrange(2):
                                candScore = roots[pos].scores[relIdx][cand]
                                candDep = pos + (1 - cand)
                                # The node at position 0 may never become a
                                # child.
                                if topScore < candScore and candDep > 0:
                                    topScore = candScore
                                    headPos, depPos = pos + cand, candDep
                                    pairPos, direction = pos, cand
                                    chosenRel, chosenRelIdx = relName, relIdx

                    # Clear the scores stored on the nodes around the
                    # attachment site (presumably so they are recomputed by
                    # the next __evaluate call -- confirm there).
                    windowStart = max(0, pairPos - self.k - 1)
                    windowEnd = min(len(forest.roots), pairPos + self.k + 2)
                    for stale in xrange(windowStart, windowEnd):
                        roots[stale].scores = None

                    dependent = roots[depPos]
                    dependent.pred_parent_id = forest.roots[headPos].id
                    dependent.pred_relation = chosenRel

                    # Feed the child's two LSTM outputs plus the relation
                    # embedding into the head's LSTM for this direction.
                    childSummary = concatenate([
                        dependent.lstms[0].output(),
                        lookup(self.model["rels-lookup"], chosenRelIdx),
                        dependent.lstms[1].output()
                    ])
                    head = roots[headPos]
                    head.lstms[direction] = head.lstms[direction].add_input(
                        self.activation(self.lstm2lstmbias +
                                        self.lstm2lstm * childSummary))

                    forest.Attach(headPos, depPos)

                renew_cg()
                yield sentence
Exemplo n.º 2
0
    def Train(self, conll_path):
        mloss = 0.0
        errors = 0
        batch = 0
        eloss = 0.0
        eerrors = 0
        lerrors = 0
        etotal = 0
        ltotal = 0

        start = time.time()

        with open(conll_path, 'r') as conllFP:
            shuffledData = list(read_conll(conllFP, True))
            random.shuffle(shuffledData)

            errs = []
            eeloss = 0.0

            self.Init()

            for iSentence, sentence in enumerate(shuffledData):
                if iSentence % 100 == 0 and iSentence != 0:
                    print 'Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Errors:', (
                        float(eerrors)) / etotal, 'Labeled Errors:', (
                            float(lerrors) /
                            etotal), 'Time', time.time() - start
                    start = time.time()
                    eerrors = 0
                    eloss = 0.0
                    etotal = 0
                    lerrors = 0
                    ltotal = 0

                forest = ParseForest(sentence)
                self.getWordEmbeddings(forest, True)

                for root in forest.roots:
                    root.lstms = [
                        self.builders[0].initial_state().add_input(root.vec),
                        self.builders[1].initial_state().add_input(root.vec)
                    ]

                unassigned = {
                    entry.id: sum([
                        1 for pentry in sentence
                        if pentry.parent_id == entry.id
                    ])
                    for entry in sentence
                }

                while len(forest.roots) > 1:
                    self.__evaluate(forest, True)
                    bestValidOp, bestValidScore = None, float("-inf")
                    bestWrongOp, bestWrongScore = None, float("-inf")

                    bestValidParent, bestValidChild = None, None
                    bestValidIndex, bestWrongIndex = None, None
                    roots = forest.roots

                    rootsIds = set([root.id for root in roots])

                    for i in xrange(len(forest.roots) - 1):
                        for irel, rel in enumerate(self.irels):
                            for op in xrange(2):
                                child = i + (1 - op)
                                parent = i + op

                                oracleCost = unassigned[roots[child].id] + (
                                    0 if roots[child].parent_id not in rootsIds
                                    or roots[child].parent_id
                                    == roots[parent].id else 1)

                                if oracleCost == 0 and (
                                        roots[child].parent_id !=
                                        roots[parent].id
                                        or roots[child].relation == rel):
                                    if bestValidScore < forest.roots[i].scores[
                                            irel][op]:
                                        bestValidScore = forest.roots[
                                            i].scores[irel][op]
                                        bestValidOp = op
                                        bestValidParent, bestValidChild = parent, child
                                        bestValidIndex = i
                                        bestValidIRel, bestValidRel = irel, rel
                                        bestValidExpr = roots[
                                            bestValidIndex].exprs[
                                                bestValidIRel][bestValidOp]
                                elif bestWrongScore < forest.roots[i].scores[
                                        irel][op]:
                                    bestWrongScore = forest.roots[i].scores[
                                        irel][op]
                                    bestWrongParent, bestWrongChild = parent, child
                                    bestWrongOp = op
                                    bestWrongIndex = i
                                    bestWrongIRel, bestWrongRel = irel, rel
                                    bestWrongExpr = roots[
                                        bestWrongIndex].exprs[bestWrongIRel][
                                            bestWrongOp]

                    if bestValidScore < bestWrongScore + 1.0:
                        loss = bestWrongExpr - bestValidExpr
                        mloss += 1.0 + bestWrongScore - bestValidScore
                        eloss += 1.0 + bestWrongScore - bestValidScore
                        errs.append(loss)

                    if not self.oracle or bestValidScore - bestWrongScore > 1.0 or (
                            bestValidScore > bestWrongScore
                            and random.random() > 0.1):
                        selectedOp = bestValidOp
                        selectedParent = bestValidParent
                        selectedChild = bestValidChild
                        selectedIndex = bestValidIndex
                        selectedIRel, selectedRel = bestValidIRel, bestValidRel
                    else:
                        selectedOp = bestWrongOp
                        selectedParent = bestWrongParent
                        selectedChild = bestWrongChild
                        selectedIndex = bestWrongIndex
                        selectedIRel, selectedRel = bestWrongIRel, bestWrongRel

                    if roots[selectedChild].parent_id != roots[
                            selectedParent].id or selectedRel != roots[
                                selectedChild].relation:
                        lerrors += 1
                        if roots[selectedChild].parent_id != roots[
                                selectedParent].id:
                            errors += 1
                            eerrors += 1

                    etotal += 1

                    for j in xrange(
                            max(0, selectedIndex - self.k - 1),
                            min(len(forest.roots),
                                selectedIndex + self.k + 2)):
                        roots[j].scores = None

                    unassigned[roots[selectedChild].parent_id] -= 1

                    roots[selectedParent].lstms[selectedOp] = roots[
                        selectedParent].lstms[selectedOp].add_input(
                            self.activation(self.lstm2lstm * noise(
                                concatenate([
                                    roots[selectedChild].lstms[0].output(),
                                    lookup(self.model["rels-lookup"],
                                           selectedIRel),
                                    roots[selectedChild].lstms[1].output()
                                ]), 0.0) + self.lstm2lstmbias))

                    forest.Attach(selectedParent, selectedChild)

                if len(errs) > 50.0:
                    eerrs = ((esum(errs)) * (1.0 / (float(len(errs)))))
                    scalar_loss = eerrs.scalar_value()
                    eerrs.backward()
                    self.trainer.update()
                    errs = []
                    lerrs = []

                    renew_cg()
                    self.Init()

        if len(errs) > 0:
            eerrs = (esum(errs)) * (1.0 / (float(len(errs))))
            eerrs.scalar_value()
            eerrs.backward()
            self.trainer.update()

            errs = []
            lerrs = []

            renew_cg()

        self.trainer.update_epoch()
        print "Loss: ", mloss / iSentence
Exemplo n.º 3
0
    def Train(self, conll_path, options):
        mloss = 0.0
        errors = 0
        batch = 0
        eloss = 0.0
        #eerrors = 0
        #lerrors = 0
        etotal = 0
        #ltotal = 0
        max_quotient = float("-inf")
        min_quotient = float("inf")
        NUM_SAMPLES = options.num_samples  #default 10

        start = time.time()

        with open(conll_path, 'r') as conllFP:
            shuffledData = list(read_conll(conllFP, True))
            random.shuffle(shuffledData)

            errs = []
            #eeloss = 0.0
            batch_errs = []

            self.Init()

            for iSentence, sentence in enumerate(shuffledData):
                if iSentence % 100 == 0 and iSentence != 0:
                    print 'Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Time', time.time(
                    ) - start
                    #print 'Processing sentence number:', iSentence, 'Loss:', eloss / etotal, 'Errors:', (float(eerrors)) / etotal, 'Labeled Errors:', (float(lerrors) / etotal) , 'Time', time.time()-start
                    start = time.time()
                    #eerrors = 0
                    eloss = 0.0
                    etotal = 0
                    #lerrors = 0
                    #ltotal = 0
                sample_errs = []
                sample_quotients = []
                #print('Sentence: {}'.format(sentence))
                DEBUG = random.random() < 0.0001
                if DEBUG:
                    print("Train sentence: {}".format(
                        [e.form for e in sentence]))
                for _ in xrange(NUM_SAMPLES):

                    forest = ParseForest(sentence)
                    self.getWordEmbeddings(forest, True)

                    for root in forest.roots:
                        root.lstms = [
                            self.builders[0].initial_state().add_input(
                                root.vec),
                            self.builders[1].initial_state().add_input(
                                root.vec)
                        ]

                    unassigned = {
                        entry.id: sum([
                            1 for pentry in sentence
                            if pentry.parent_id == entry.id
                        ])
                        for entry in sentence
                    }

                    #loss = 0
                    log_q_total = 0.0
                    log_p_total = 0.0
                    while len(forest.roots) > 1:
                        self.__evaluate(
                            forest, True)  #NOTE(prkriley): this updates scores
                        roots = forest.roots

                        rootsIds = set([root.id for root in roots])

                        def _isValid(i):
                            return (unassigned[roots[i].id] == 0) and (
                                (i > 0
                                 and roots[i].parent_id == roots[i - 1].id) or
                                (i < len(roots) - 1
                                 and roots[i].parent_id == roots[i + 1].id))

                        valid_zs = [
                            j for j in xrange(1, len(roots)) if _isValid(j)
                        ]

                        z_scores = concatenate([r.zexpr for r in roots[1:]])
                        valid_z_scores = concatenate(
                            [roots[j].zexpr for j in valid_zs])
                        p_zs = softmax(z_scores)
                        #print("P(z): {}".format(p_zs.npvalue()))
                        q_temperature = 16.0
                        q_zs = softmax(valid_z_scores * 1.0 / q_temperature)
                        q_zs_numpy = q_zs.npvalue()
                        q_zs_numpy /= np.sum(q_zs_numpy)
                        if DEBUG:
                            print("Valid z indices: {}".format(valid_zs))
                            print("Q(z): {}".format(q_zs_numpy))

                        valid_i = np.random.choice(len(valid_zs), p=q_zs_numpy)
                        q_z = pick(q_zs, valid_i)
                        i = valid_zs[valid_i]
                        log_q_total += log(q_z).scalar_value()
                        p_z = pick(p_zs, i - 1)
                        log_p_total += log(p_z).scalar_value()

                        irel = list(self.irels).index(roots[i].relation)
                        op = 0 if roots[i].parent_id == roots[i - 1].id else 1
                        #TODO(prkriley): verify correctness of this index math
                        presoftmax_p_y = [
                            val for tup in roots[i].exprs for val in tup
                        ]
                        if i < len(roots) - 1:
                            neglog_p_y = pickneglogsoftmax(
                                concatenate(presoftmax_p_y), irel * 2 + op)
                        else:
                            assert op == 0
                            presoftmax_p_y = presoftmax_p_y[::2]
                            neglog_p_y = pickneglogsoftmax(
                                concatenate(presoftmax_p_y), irel)

                        neglog_p_z = pickneglogsoftmax(z_scores, i - 1)
                        errs.append(neglog_p_y + neglog_p_z)
                        log_p_total -= neglog_p_y.scalar_value()
                        mloss += neglog_p_y.scalar_value()
                        mloss += neglog_p_z.scalar_value()

                        etotal += 1

                        selectedChild = i
                        selectedIndex = i
                        selectedOp = op
                        selectedParent = i + [-1, 1][op]
                        selectedIRel = irel

                        for j in xrange(
                                max(0, selectedIndex - self.k - 2),
                                min(len(forest.roots),
                                    selectedIndex + self.k + 2)):
                            roots[j].scores = None

                        #NOTE(prkriley): counts number of real children that are still gettable
                        unassigned[roots[selectedChild].parent_id] -= 1

                        #NOTE(prkriley): I think lstms[0] is the right one, [1] is the left...
                        roots[selectedParent].lstms[selectedOp] = roots[
                            selectedParent].lstms[selectedOp].add_input(
                                self.activation(self.lstm2lstm * noise(
                                    concatenate([
                                        roots[selectedChild].lstms[0].output(),
                                        lookup(self.model["rels-lookup"],
                                               selectedIRel),
                                        roots[selectedChild].lstms[1].output()
                                    ]), 0.0) + self.lstm2lstmbias))

                        forest.Attach(selectedParent, selectedChild)

                    #END OF SINGLE SAMPLE
                    #TODO(prkriley): finalize loss, do update, etc
                    eerrs = (
                        (esum(errs)) * (1.0 / (float(len(errs))))
                    )  #TODO(prkriley): consider removing this division
                    #eerrs = esum(errs)
                    #TODO(prkriley): scale by p/q which is exp(logp-logq)
                    #print("logp: {}; logq: {}".format(log_p_total, log_q_total))
                    pq_quotient = np.exp(log_p_total - log_q_total)
                    scaled_pq_quotient = pq_quotient * 1e3
                    #scaled_pq_quotient = min(scaled_pq_quotient, 1.5e-5)
                    #scaled_pq_quotient = max(scaled_pq_quotient, 1.5e-8)
                    #eerrs *= scaled_pq_quotient
                    #print("P/Q: {}".format(pq_quotient))
                    max_quotient = max(scaled_pq_quotient, max_quotient)
                    min_quotient = min(scaled_pq_quotient, min_quotient)
                    eloss += eerrs.scalar_value()
                    sample_errs.append(eerrs)
                    sample_quotients.append(scaled_pq_quotient)
                    errs = []

                    DEBUG = False
                #END OF SAMPLING
                #upper_clip = 5e-6
                #lower_clip = 2e-8

                #scale = 1.0
                #if max_quotient < lower_clip:
                #    scale = lower_clip / max_quotient
                ###
                #SCALING QUOTIENTS

                #max_sample_quotient = max(sample_quotients)
                #if max_sample_quotient > upper_clip:
                #    scale = upper_clip / max_sample_quotient
                sum_quotients = sum(sample_quotients)
                PQ_NORMALIZE_SUM = options.pq_norm
                scale = PQ_NORMALIZE_SUM / sum_quotients
                sample_quotients = [e * scale for e in sample_quotients]

                #for q in sample_quotients:
                #    assert q <= upper_clip * 1.1, "Large quotient: {}".format(q)
                ###
                if options.use_pq:
                    sample_errs = [
                        e * q for (e, q) in zip(sample_errs, sample_quotients)
                    ]

                final_error = esum(sample_errs)
                if not options.use_pq:
                    assert len(sample_errs) == NUM_SAMPLES
                    final_error *= (1.0 / (float(len(sample_errs))))

                #TODO(prkriley): put final_error somewhere and update once we have N of them
                batch_errs.append(final_error)
                if len(batch_errs) >= options.batch_size:
                    total_error = esum(batch_errs)
                    total_error.backward()
                    self.trainer.update()
                    batch_errs = []

                    renew_cg()
                    self.Init()

                #final_error.backward()
                #self.trainer.update()

                #renew_cg()
                #self.Init()
            #END OF EPOCH
        #FILE CLOSE

        if options.use_pq:
            print("Max Quotient: {}; Min Quotient: {}".format(
                max_quotient, min_quotient))
        #self.trainer.update_epoch() #TODO(prkriley): verify that AdamTrainer handles everything this did before
        print "Loss: ", mloss / (iSentence * NUM_SAMPLES)
Exemplo n.º 4
0
    def Predict(self, conll_path):
        """Parse each sentence of a CoNLL file and yield it with predictions.

        At every step the child position is the argmax of a softmax over the
        ``zexpr`` scores of positions 1.., and the (relation, direction) is
        the argmax over that node's ``exprs``.  The chosen head id and
        relation are stored on the child node as ``pred_parent_id`` and
        ``pred_relation``.

        Diagnostics are printed to stdout for every sentence and every step.
        The gold annotations are read only to produce those diagnostics.

        This is a generator, so the file stays open while it is consumed.

        Args:
            conll_path: Path of the CoNLL file to parse.

        Yields:
            The sentence object produced by ``read_conll`` once its forest
            has been reduced to one root.
        """
        with open(conll_path, 'r') as conllFP:
            for sentence in read_conll(conllFP, False):
                print("Sentence: {}".format([e.form for e in sentence]))
                self.Init()
                forest = ParseForest(sentence)
                self.getWordEmbeddings(forest, False)

                for root in forest.roots:
                    root.lstms = [
                        self.builders[0].initial_state().add_input(root.vec),
                        self.builders[1].initial_state().add_input(root.vec)
                    ]

                #NOTE(prkriley): looking at truth here, but ONLY for reporting
                # For each entry id: how many gold children are not attached
                # yet.
                unassigned = {
                    entry.id: sum([
                        1 for pentry in sentence
                        if pentry.parent_id == entry.id
                    ])
                    for entry in sentence
                }

                while len(forest.roots) > 1:

                    self.__evaluate(forest, False)
                    roots = forest.roots

                    # Pick the child position; p_z starts at position 1,
                    # hence the + 1.
                    z_scores = concatenate([r.zexpr for r in roots[1:]])
                    p_z = softmax(z_scores).npvalue()
                    bestIndex = np.argmax(p_z) + 1
                    print('P(z): {}'.format(p_z))
                    print('Best index: {} ({})'.format(bestIndex,
                                                       roots[bestIndex].form))
                    # Flattened as [rel0/op0, rel0/op1, rel1/op0, ...]
                    # (assumes two entries per relation -- confirm).
                    valid_exprs = [
                        val for tup in roots[bestIndex].exprs for val in tup
                    ]
                    # The last root has no right neighbour, so only the
                    # op == 0 entries compete.
                    if bestIndex == len(roots) - 1:
                        valid_exprs = valid_exprs[::2]
                    p_y = softmax(concatenate(valid_exprs))
                    max_y_index = np.argmax(
                        p_y.npvalue()
                    )  #NOTE(prkriley): don't need to actually do softmax just to pick max

                    if bestIndex < len(roots) - 1:
                        bestOp = max_y_index % 2
                        # Floor division keeps this an integer index under
                        # true-division semantics as well.
                        bestIRelation = (max_y_index - bestOp) // 2
                    else:
                        bestOp = 0
                        bestIRelation = max_y_index
                    #TODO(prkriley): make sure op is valid
                    # op == 0: head is the left neighbour; op == 1: the right.
                    bestChild = bestIndex
                    bestParent = bestIndex + [-1, 1][bestOp]
                    bestRelation = self.irels[bestIRelation]

                    #NOTE(prkriley): again, using truth but only for reporting
                    def _isValid(i):
                        # Valid child: all of its gold children are attached
                        # and its gold head is an adjacent root.
                        return (unassigned[roots[i].id] == 0) and (
                            (i > 0 and roots[i].parent_id == roots[i - 1].id)
                            or (i < len(roots) - 1
                                and roots[i].parent_id == roots[i + 1].id))

                    valid_zs = [
                        j for j in xrange(1, len(roots)) if _isValid(j)
                    ]
                    valid_probs = [p_z[j - 1] for j in valid_zs]
                    invalid_probs = [
                        p_z[j - 1] for j in xrange(1, len(roots))
                        if j not in valid_zs
                    ]
                    # -1 marks "no positions of this kind".
                    avg_valid_prob = sum(valid_probs) * 1.0 / len(
                        valid_probs) if valid_probs else -1
                    avg_invalid_prob = sum(invalid_probs) * 1.0 / len(
                        invalid_probs) if invalid_probs else -1
                    print("Avg valid prob: {}/{} = {}".format(
                        sum(valid_probs), len(valid_probs), avg_valid_prob))
                    print("Avg invalid prob: {}/{} = {}".format(
                        sum(invalid_probs), len(invalid_probs),
                        avg_invalid_prob))

                    # Clear the scores stored on the nodes around the
                    # attachment site.
                    for j in xrange(
                            max(0, bestIndex - self.k - 2),
                            min(len(forest.roots), bestIndex + self.k + 2)):
                        roots[j].scores = None

                    roots[bestChild].pred_parent_id = forest.roots[
                        bestParent].id
                    roots[bestChild].pred_relation = bestRelation

                    roots[bestParent].lstms[bestOp] = roots[bestParent].lstms[
                        bestOp].add_input((self.activation(
                            self.lstm2lstmbias + self.lstm2lstm * concatenate([
                                roots[bestChild].lstms[0].output(),
                                lookup(self.model["rels-lookup"], bestIRelation
                                       ), roots[bestChild].lstms[1].output()
                            ]))))

                    # Reporting-only bookkeeping: skip it when the gold head
                    # is not an id of this sentence, so that missing gold
                    # annotations cannot abort prediction.
                    goldParentId = roots[bestChild].parent_id
                    if goldParentId in unassigned:
                        unassigned[goldParentId] -= 1
                    forest.Attach(bestParent, bestChild)

                renew_cg()
                yield sentence