Exemplo n.º 1
0
        conceptFileName = a
    elif o == "--wordMap":
        wordFileName = a
    
# Announce the start of the run when verbose output was requested.
if verbose:
    print("Start")
    print("-------------------------------------------------")

    print("Smoothing concept1GivenC2C3C4.")

# Load the concept and word lexicons; their lengths are kept as cardinalities.
conceptMap = LexMap().read(conceptFileName)
wordMap = LexMap().read(wordFileName)
conceptCard = len(conceptMap)
wordCard = len(wordMap)

# Read the DT and the DCPT named "jointProbC1C2C3C4", then check the DCPT
# with toolkit.testProb before it is used.
jointDtC1C2C3C4 = gmtk.readDt(wordDtFileName)
jointDcptC1C2C3C4 = gmtk.readDcpt(dcptsFileName, "jointProbC1C2C3C4")
# NOTE(review): assert statements are removed when Python runs with -O, so this
# check silently disappears in optimized runs.
assert toolkit.testProb(jointDcptC1C2C3C4), "Sum of probabilities should be always 1."

# Combine the DT with the DCPT into the joint table used below.
jointProbC1C2C3C4 = gmtk.combineDtDcpt1(jointDtC1C2C3C4, jointDcptC1C2C3C4) 

# Currently switched off: disable generating _EMPTY_ given
# _EMPTY_, _EMPTY_, _EMPTY_ by setting that entry to 0.
# jointProbC1C2C3C4.setValue([int(conceptMap["_EMPTY_"]), 
#                             int(conceptMap["_EMPTY_"]), 
#                             int(conceptMap["_EMPTY_"]), 
#                             int(conceptMap["_EMPTY_"])], 0)

# Currently switched off: reading and checking the "concept1GivenC2C3C4" DPMF.
##concept1DtC2C3C4 = gmtk.readDt(concept1DtFileName)
##concept1DpmfC2C3C4 = gmtk.readDpmf(dpmfsFileName, "concept1GivenC2C3C4")
##assert toolkit.testProb2(concept1DpmfC2C3C4), "Sum of probabilities should be always 1."
##
    def main(self, conceptMap, symbolMap, symbolDt, dcpts, dpmfs, symbolName, outDir):
        """Save the tables of ``symbolName`` given C1..C4, C1..C3, C1C2, C1 and alone.

        The method reads the "jointProbC1C2C3C4" DCPT and the
        "<symbolName>GivenC1C2C3C4" DPMF, multiplies them, and then repeatedly
        marginalizes and conditionalizes the result, writing one table per
        step through ``gmtk``.

        Args:
            conceptMap: Handed to ``LexMap().read`` to load the concept lexicon
                (presumably a file name -- confirm against the caller).
            symbolMap: Handed to ``LexMap().read`` to load the symbol lexicon.
            symbolDt: Handed to ``gmtk.readDt``.
            dcpts: Handed to ``gmtk.readDcpt`` together with the table name
                "jointProbC1C2C3C4".
            dpmfs: Handed to ``gmtk.readDpmf`` together with the table name
                "<symbolName>GivenC1C2C3C4".
            symbolName: Prefix substituted into every table name.
            outDir: First argument of every ``gmtk.save*`` call.

        Returns:
            None. All results are written through the ``gmtk.save*`` calls.
        """
        concept_lex = LexMap().read(conceptMap)
        symbol_lex = LexMap().read(symbolMap)

        sink_symbol = int(symbol_lex["_sink_"])
        sink_concept = int(concept_lex["_SINK_"])

        n_concepts = len(concept_lex)
        n_symbols = len(symbol_lex)

        def label(template):
            # Substitute the symbol name into a table-name template.
            return template % symbolName

        def store_dpmf(template, parents, table):
            # Save `table` under its formatted name; the count passed along is
            # the length of what vectSubList returns for the parent positions.
            gmtk.saveDpmfsProbs(
                outDir,
                label(template),
                len(table.vectSubList(list(parents))),
                n_symbols,
                table,
            )

        tree = gmtk.readDt(symbolDt)
        concept_dcpt = gmtk.readDcpt(dcpts, "jointProbC1C2C3C4")
        concept_joint = gmtk.combineDtDcpt1(tree, concept_dcpt)

        full_dpmf = gmtk.readDpmf(dpmfs, label("%sGivenC1C2C3C4"))
        full_cond = gmtk.combineDtDcpt2(tree, full_dpmf)

        # Joint table over the symbol and all four concepts.
        joint = full_cond.multiple([1, 2, 3, 4], concept_joint)
        store_dpmf("%sGivenC1C2C3C4", (1, 2, 3, 4), full_cond)

        # Back off one concept at a time: three conditioning concepts, then two.
        for parents, template in (
            ((1, 2, 3), "%sGivenC1C2C3"),
            ((1, 2), "%sGivenC1C2"),
        ):
            joint = joint.marginalize([0] + list(parents))
            conditional = joint.conditionalize(list(parents))
            store_dpmf(template, parents, conditional)

        joint = joint.marginalize([0, 1])
        bigram = joint.conditionalize([1])

        # When conditioning on _SINK_, only the _sink_ symbol may be generated;
        # otherwise the _SINK_ concept would show up in the stack.
        # TODO: Turn it into validator
        bigram.setValue([sink_symbol, sink_concept], 1)

        gmtk.saveDcptBigram(outDir, label("%sGivenC1"), n_symbols, n_concepts, bigram)

        unigram = joint.marginalize([0])

        # Only the _unseen_ word should be decodable here, so the entry for
        # _empty_ is set to zero and the table is renormalized afterwards.
        unigram.setValue([int(symbol_lex["_empty_"])], 0)
        unigram = unigram.normJoint()

        gmtk.saveDcptUnigram(outDir, label("%sUnigram"), n_symbols, unigram)

        gmtk.saveDcptUnseen(outDir, label("%sZerogram"), n_symbols, symbol_lex)
Exemplo n.º 3
0
        pushCard = int(a)
    elif o == "--penalty":
        penalty = float(a)

# Announce the start of the run when verbose output was requested.
if verbose:
    print("Start")
    print("-------------------------------------------------")

    print("Smoothing pushGivenC1C2C3C4.")

# Load the concept lexicon; its length is kept as the concept cardinality.
conceptMap = LexMap().read(conceptFileName)
conceptCard = len(conceptMap)

# read P(C1, C2, C3, C4)
##############################################################################
jointDtC1C2C3C4 = gmtk.readDt(pushDtFileName)
jointDcptC1C2C3C4 = gmtk.readDcpt(dcptsFileName, "jointProbC1C2C3C4")
# NOTE(review): assert statements are removed when Python runs with -O, so the
# checks in this script silently disappear in optimized runs.
assert toolkit.testProb(
    jointDcptC1C2C3C4), "Sum of probabilities should be always 1."

# Combine the DT with the DCPT into the joint table over the four concepts.
jointProbC1C2C3C4 = gmtk.combineDtDcpt1(jointDtC1C2C3C4, jointDcptC1C2C3C4)

# read P(push | C1, C2, C3, C4)
pushDpmfC1C2C3C4 = gmtk.readDpmf(dpmfsFileName, "pushGivenC1C2C3C4")
assert toolkit.testProb2(
    pushDpmfC1C2C3C4), "Sum of probabilities should be always 1."

# Combine the same DT with the DPMF into the conditional table.
pushGivenC1C2C3C4 = gmtk.combineDtDcpt2(jointDtC1C2C3C4, pushDpmfC1C2C3C4)

#
# Next step: jointProbPC1C2C3C4 = pushGivenC1C2C3C4 * jointProbC1C2C3C4
    def main(self, conceptMap, symbolMap, symbolDt, dcpts, dpmfs, symbolName,
             outDir):
        """Write the ``symbolName`` tables for every conditioning order from 4 to 0.

        Starting from the "<symbolName>GivenC1C2C3C4" DPMF and the
        "jointProbC1C2C3C4" DCPT, the method builds their product and then
        marginalizes and conditionalizes it step by step. Each step's table is
        saved through ``gmtk``, ending with the unigram and the "Zerogram"
        table.

        Args:
            conceptMap: Input of ``LexMap().read`` for the concept lexicon
                (presumably a file name -- confirm against the caller).
            symbolMap: Input of ``LexMap().read`` for the symbol lexicon.
            symbolDt: Input of ``gmtk.readDt``.
            dcpts: Input of ``gmtk.readDcpt`` (table "jointProbC1C2C3C4").
            dpmfs: Input of ``gmtk.readDpmf`` (table
                "<symbolName>GivenC1C2C3C4").
            symbolName: Prefix substituted into all table names.
            outDir: Passed first to every ``gmtk.save*`` call.

        Returns:
            None. Output is produced only by the ``gmtk.save*`` calls.
        """
        concepts = LexMap().read(conceptMap)
        symbols = LexMap().read(symbolMap)

        sink_symbol_id = int(symbols['_sink_'])
        sink_concept_id = int(concepts['_SINK_'])

        concept_count = len(concepts)
        symbol_count = len(symbols)

        def back_off(table, order):
            # Marginalize with positions 0..order, then conditionalize the
            # result with positions 1..order; return both tables.
            reduced = table.marginalize(list(range(order + 1)))
            given = reduced.conditionalize(list(range(1, order + 1)))
            return reduced, given

        decision_tree = gmtk.readDt(symbolDt)
        concepts_dcpt = gmtk.readDcpt(dcpts, "jointProbC1C2C3C4")
        concepts_joint = gmtk.combineDtDcpt1(decision_tree, concepts_dcpt)

        order4_dpmf = gmtk.readDpmf(dpmfs, "%sGivenC1C2C3C4" % symbolName)
        order4_given = gmtk.combineDtDcpt2(decision_tree, order4_dpmf)

        # Product of the order-4 conditional table and the concept joint table.
        order4_joint = order4_given.multiple([1, 2, 3, 4], concepts_joint)
        gmtk.saveDpmfsProbs(
            outDir, "%sGivenC1C2C3C4" % symbolName,
            len(order4_given.vectSubList([1, 2, 3, 4])), symbol_count,
            order4_given)

        order3_joint, order3_given = back_off(order4_joint, 3)
        gmtk.saveDpmfsProbs(
            outDir, "%sGivenC1C2C3" % symbolName,
            len(order3_given.vectSubList([1, 2, 3])), symbol_count,
            order3_given)

        order2_joint, order2_given = back_off(order3_joint, 2)
        gmtk.saveDpmfsProbs(
            outDir, "%sGivenC1C2" % symbolName,
            len(order2_given.vectSubList([1, 2])), symbol_count,
            order2_given)

        order1_joint, order1_given = back_off(order2_joint, 1)

        # Given the _SINK_ concept only the _sink_ symbol may be generated,
        # otherwise the _SINK_ concept would appear in the stack.
        # TODO: Turn it into validator
        order1_given.setValue([sink_symbol_id, sink_concept_id], 1)

        gmtk.saveDcptBigram(outDir, "%sGivenC1" % symbolName, symbol_count,
                            concept_count, order1_given)

        order0 = order1_joint.marginalize([0])

        # Only the _unseen_ word should be decodable from this table, so the
        # _empty_ entry is zeroed before renormalizing.
        order0.setValue([int(symbols["_empty_"])], 0)
        order0 = order0.normJoint()

        gmtk.saveDcptUnigram(outDir, "%sUnigram" % symbolName, symbol_count,
                             order0)

        gmtk.saveDcptUnseen(outDir, "%sZerogram" % symbolName, symbol_count,
                            symbols)
        popCard = int(a)
    elif o == "--penalty":
        penalty = float(a)

# Announce the start of the run when verbose output was requested.
if verbose:
    print("Start")
    print("-------------------------------------------------")

    print("Smoothing popGivenC1C2C3C4.")

# Load the concept lexicon; its length is kept as the concept cardinality.
conceptMap = LexMap().read(conceptFileName)
conceptCard = len(conceptMap)

# read P(C1, C2, C3, C4)
##############################################################################
jointDtC1C2C3C4 = gmtk.readDt(popDtFileName)
jointDcptC1C2C3C4 = gmtk.readDcpt(dcptsFileName, "jointProbC1C2C3C4")
# NOTE(review): assert statements are removed when Python runs with -O, so the
# checks in this script silently disappear in optimized runs.
assert toolkit.testProb(jointDcptC1C2C3C4), "Sum of probabilities should be always 1."

# Combine the DT with the DCPT into the joint table over the four concepts.
jointProbC1C2C3C4 = gmtk.combineDtDcpt1(jointDtC1C2C3C4, jointDcptC1C2C3C4) 

# read P(pop | C1, C2, C3, C4)
popDpmfC1C2C3C4 = gmtk.readDpmf(dpmfsFileName, "popGivenC1C2C3C4")
assert toolkit.testProb2(popDpmfC1C2C3C4), "Sum of probabilities should be always 1."

# Combine the same DT with the DPMF into the conditional table.
popGivenC1C2C3C4 = gmtk.combineDtDcpt2(jointDtC1C2C3C4, popDpmfC1C2C3C4)

#
# Next step: jointProbPC1C2C3C4 = popGivenC1C2C3C4 * jointProbC1C2C3C4
#