Example #1
def computeDevLoss():
    global printHere
    counterDev = 0
    corpusDev = CorpusIteratorFuncHead(
        language, "dev").iterator(rejectShortSentences=True)
    partitionsDev = getPartitions(corpusDev)
    devLoss = 0
    devAccuracy = 0
    devAccuracyLabeled = 0
    devWords = 0
    for partitionDev in partitionsDev:
        counterDev += 1
        printHere = (counterDev % 500 == 0)
        loss, _, accuracy, accuracyLabeled, wordNum = forward(
            partitionDev, computeAccuracy=True, doDropout=False)
        devLoss += loss.data.cpu().numpy()
        devAccuracy += accuracy
        devAccuracyLabeled += accuracyLabeled
        devWords += wordNum
        if counterDev % 50 == 0:
            print "Run on dev " + str(counterDev)
            print(devLoss / devWords,
                  float(devAccuracy) / devWords,
                  float(devAccuracyLabeled) / devWords, devWords)

    newDevLoss = devLoss / devWords
    newDevAccuracy = float(devAccuracy) / devWords
    newDevAccuracyLabeled = float(devAccuracyLabeled) / devWords
    devLosses.append(newDevLoss)
    devAccuracies.append(newDevAccuracy)
    devAccuraciesLabeled.append(newDevAccuracyLabeled)
Example #2
def initializeOrderTable():
   orderTable = {}
   keys = set()
   vocab = {}
   distanceSum = {}
   distanceCounts = {}
   depsVocab = set()
   for partition in ["train", "dev"]:
     for sentence in CorpusIteratorFuncHead(language,partition).iterator():
      for line in sentence:
          vocab[line["word"]] = vocab.get(line["word"], 0) + 1
          line["coarse_dep"] = makeCoarse(line["dep"])
          depsVocab.add(line["coarse_dep"])
          posFine.add(line["posFine"])
          posUni.add(line["posUni"])
  
          if line["coarse_dep"] == "root":
             continue
          posHere = line["posUni"]
          posHead = sentence[line["head"]-1]["posUni"]
          dep = line["coarse_dep"]
          direction = "HD" if line["head"] < line["index"] else "DH"
          key = dep
          keyWithDir = (dep, direction)
          orderTable[keyWithDir] = orderTable.get(keyWithDir, 0) + 1
          keys.add(key)
          distanceCounts[key] = distanceCounts.get(key,0.0) + 1.0
          distanceSum[key] = distanceSum.get(key,0.0) + abs(line["index"] - line["head"])
   dhLogits = {}
   for key in keys:
      hd = orderTable.get((key, "HD"), 0) + 1.0
      dh = orderTable.get((key, "DH"), 0) + 1.0
      dhLogit = log(dh) - log(hd)
      dhLogits[key] = dhLogit
   return dhLogits, vocab, keys, depsVocab
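
The direction statistic computed above is an add-one-smoothed log-odds of the two word orders. A minimal self-contained sketch with made-up counts (not from any corpus):

from math import log

# Hypothetical counts for one relation: "DH" = dependent before head,
# "HD" = head before dependent.
orderTable = {("amod", "DH"): 90, ("amod", "HD"): 10}

hd = orderTable.get(("amod", "HD"), 0) + 1.0  # add-one smoothing
dh = orderTable.get(("amod", "DH"), 0) + 1.0
dhLogit = log(dh) - log(hd)  # positive means dependent-first is more common
print(dhLogit)  # log(91/11), roughly 2.11
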
Example #3
def computeDevLoss():
   global printHere
   global counter


   devLoss_lm = 0.0
   devWords_lm = 0



   corpusDev = CorpusIteratorFuncHead(args.language,"dev").iterator(rejectShortSentences = True)

   while True:
     try:
        batch = map(lambda x:next(corpusDev), 10*range(args.batchSize))
     except StopIteration:
        break
     batch = sorted(batch, key=len)
     partitions = range(10)
     shuffle(partitions)
     for partition in partitions:
        counter += 1
        printHere = (counter % 50 == 0)
        current = batch[partition*args.batchSize:(partition+1)*args.batchSize]
 
        fromLM = doForwardPass(current, train=False)
        _, _, _, newLoss_lm, newWords_lm = fromLM


        devLoss_lm += newLoss_lm
        devWords_lm += newWords_lm



   return devLoss_lm/devWords_lm
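
The loop above (and in several examples below) implements length bucketing: read ten batches' worth of sentences, sort them by length so each slice holds similar-length sentences, then shuffle the slice order so batch lengths do not grow monotonically. A minimal Python 3 sketch of the same idea (itertools.islice stands in for the Python 2 map-over-range trick; the names are illustrative):

import itertools
from random import shuffle

def bucketedBatches(iterator, batchSize, numBuckets=10):
    while True:
        chunk = list(itertools.islice(iterator, numBuckets * batchSize))
        if not chunk:
            break
        chunk.sort(key=len)             # group sentences of similar length
        order = list(range(numBuckets))
        shuffle(order)                  # randomize which bucket comes first
        for i in order:
            batch = chunk[i * batchSize:(i + 1) * batchSize]
            if batch:
                yield batch
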
Example #4
def computeDevLoss():
    global printHere
    global counter
    devLoss = 0.0
    devLossWords = 0.0
    devLossPOS = 0.0
    devWords = 0
    #   corpusDev = getNextSentence("dev")
    corpusDev = CorpusIteratorFuncHead(
        language, "dev").iterator(rejectShortSentences=True)

    while True:
        try:
            batch = map(lambda x: next(corpusDev), range(batchSize))
        except StopIteration:
            break
        batch = sorted(batch, key=len)
        partitions = range(1)
        shuffle(partitions)
        for partition in partitions:
            counter += 1
            printHere = (counter % 50 == 0)
            current = batch[partition * batchSize:(partition + 1) * batchSize]

            _, _, _, newLoss, newWords, lossWords, lossPOS = doForwardPass(
                current, train=False)
            devLoss += newLoss
            devWords += newWords
            devLossWords += lossWords
            devLossPOS += lossPOS
    return devLoss / devWords, devLossWords / devWords, devLossPOS / devWords
Example #5
def computeDevLoss():
   global printHere
   global counter
   global devAccuracies_parser
   global devAccuraciesLabeled_parser


   devLoss_lm = 0.0
   devWords_lm = 0

   devLoss_parser = 0.0
   devWords_parser = 0
   devAccuracy_parser = 0
   devAccuracyLabeled_parser = 0

   corpusDev = CorpusIteratorFuncHead(args.language,"dev").iterator(rejectShortSentences = True)

   while True:
     # iterate through the development set
     try:
        batch = map(lambda x:next(corpusDev), 10*range(args.batchSize))
     except StopIteration:
        break
     batch = sorted(batch, key=len)
     partitions = range(10)
     shuffle(partitions)
     for partition in partitions:
        counter += 1
        printHere = (counter % 50 == 0)
        current = batch[partition*args.batchSize:(partition+1)*args.batchSize]

        # run the model on the syntactic trees 
        fromLM, fromParser = doForwardPass(current, train=False, computeAccuracy_parser=True, doDropout_parser=False)
        _, _, _, newLoss_lm, newWords_lm = fromLM
        loss_parser, _, accuracy_parser, accuracyLabeled_parser, wordNum_parser = fromParser


        devLoss_lm += newLoss_lm
        devWords_lm += newWords_lm

        devLoss_parser += loss_parser.data.cpu().numpy()    
        devAccuracy_parser += accuracy_parser
        devAccuracyLabeled_parser += accuracyLabeled_parser
        devWords_parser += wordNum_parser


   newDevAccuracy_parser = float(devAccuracy_parser)/devWords_parser
   newDevAccuracyLabeled_parser = float(devAccuracyLabeled_parser)/devWords_parser
   devAccuracies_parser.append(newDevAccuracy_parser)
   devAccuraciesLabeled_parser.append(newDevAccuracyLabeled_parser)

   return devLoss_lm/devWords_lm, devLoss_parser/devWords_parser
Example #6
def initializeOrderTable():
    orderTable = {}
    keys = set()
    vocab = {}
    distanceSum = {}
    distanceCounts = {}
    depsVocab = set()
    for partition in ["train", "dev"]:
        for sentence in CorpusIteratorFuncHead(args.language,
                                               partition,
                                               storeMorph=True).iterator():
            for line in sentence:
                vocab[line["word"]] = vocab.get(line["word"], 0) + 1
                vocab_lemmas[line["lemma"]] = vocab_lemmas.get(
                    line["lemma"], 0) + 1

                depsVocab.add(line["dep"])
                posFine.add(line["posFine"])
                posUni.add(line["posUni"])

                for morph in line["morph"]:
                    morphKeyValuePairs.add(morph)
                if line["dep"] == "root":
                    continue

                posHere = line["posUni"]
                posHead = sentence[line["head"] - 1]["posUni"]
                dep = line["dep"]
                direction = "HD" if line["head"] < line["index"] else "DH"
                key = (posHead, dep, posHere)
                keyWithDir = (posHead, dep, posHere, direction)
                orderTable[keyWithDir] = orderTable.get(keyWithDir, 0) + 1
                keys.add(key)
                distanceCounts[key] = distanceCounts.get(key, 0.0) + 1.0
                distanceSum[key] = distanceSum.get(
                    key, 0.0) + abs(line["index"] - line["head"])
    dhLogits = {}
    for key in keys:
        hd = orderTable.get((key[0], key[1], key[2], "HD"), 0) + 1.0
        dh = orderTable.get((key[0], key[1], key[2], "DH"), 0) + 1.0
        dhLogit = log(dh) - log(hd)
        dhLogits[key] = dhLogit
        originalDistanceWeights[key] = (distanceSum[key] / distanceCounts[key])
    return dhLogits, vocab, keys, depsVocab
Example #7
        for partition in partitions:
            counter += 1
            printHere = (counter % 50 == 0)
            current = batch[partition * batchSize:(partition + 1) * batchSize]

            _, _, _, newLoss, newWords, lossWords, lossPOS = doForwardPass(
                current, train=False)
            devLoss += newLoss
            devWords += newWords
            devLossWords += lossWords
            devLossPOS += lossPOS
    return devLoss / devWords, devLossWords / devWords, devLossPOS / devWords


while True:
    corpus = CorpusIteratorFuncHead(language).iterator(
        rejectShortSentences=True)

    while True:
        try:
            batch = map(lambda x: next(corpus), 10 * range(batchSize))
        except StopIteration:
            break
        batch = sorted(batch, key=len)
        partitions = range(10)
        shuffle(partitions)
        for partition in partitions:
            counter += 1
            printHere = (counter % 100 == 0)
            current = batch[partition * batchSize:(partition + 1) * batchSize]

            loss, baselineLoss, policy_related_loss, _, wordNumInPass, lossWords, lossPOS = doForwardPass(
Example #8
batchSize = 1

lr_lm = 0.1

crossEntropy = 10.0


def encodeWord(w):
    return stoi[w] + 3 if stoi[w] < vocab_size else 1
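
encodeWord shifts known-word indices up by 3 and collapses everything else onto index 1, which implies the lowest ids are reserved for special symbols. The exact layout is not shown in this fragment; the one below is an assumption for illustration:

# Assumed reserved ids (not confirmed by the fragment): 0 = padding,
# 1 = out-of-vocabulary, 2 = end-of-sentence; real words start at 3.
stoi = {"the": 0, "cat": 1, "sat": 2}  # toy frequency-ranked vocabulary
vocab_size = 2                         # keep the two most frequent words

def encodeWord(w):
    return stoi[w] + 3 if stoi[w] < vocab_size else 1

print(encodeWord("the"), encodeWord("cat"), encodeWord("sat"))  # 3 4 1
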


import torch.nn.functional

counter = 0
while True:
    corpus = CorpusIterator(args.language, partition="together")
    corpus.permute()
    corpus = corpus.iterator(rejectShortSentences=False)

    for current in corpus:
        if counter > 50000000:
            print("Quitting at counter " + str(counter))
            quit()
        counter += 1
        printHere = (counter % 50 == 0)
        current = [current]
        batchOrdered, logits = orderSentence(current[0], dhLogits, printHere)

        metadata = current[0][1]

        maxLength = len(batchOrdered)
Example #9
assert len(languages) == 51, len(languages)

from corpusIterator_FuncHead import CorpusIteratorFuncHead

counts = {}
directions = {}

for language in languages:  #[:2]:
    orderTable = {}
    keys = set()
    vocab = {}
    distanceSum = {}
    distanceCounts = {}
    depsVocab = set()
    for partition in ["train", "dev"]:
        for sentence in CorpusIteratorFuncHead(language, partition).iterator():
            for line in sentence:
                vocab[line["word"]] = vocab.get(line["word"], 0) + 1
                dep = line["dep"]
                if dep not in counts:
                    counts[dep] = {}
                    directions[dep] = {}
                if language not in counts[dep]:
                    counts[dep][language] = 0
                    directions[dep][language] = 0
                counts[dep][language] += 1
                directions[dep][language] += (
                    1 if line["index"] > line["head"] else -1)  # +1 when the head precedes the dependent
with open("relations.tsv", "w") as outFile:
    for dep in sorted(list(counts)):
        coarse = dep[:dep.index(":")] if ":" in dep else dep
Example #10
words = list(vocab.iteritems())
#print(words)

totalCount = sum(x[1] for x in words)
probs = [float(x[1]) / totalCount for x in words]
unigram_entropy = -sum([x * log(x) for x in probs])
#print(unigram_entropy)
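
As a quick sanity check on the formula (a sketch, not part of the original): a uniform unigram distribution over n types should give exactly log(n) nats.

from math import log

counts = [("a", 5), ("b", 5), ("c", 5), ("d", 5)]  # toy uniform counts
total = sum(c for _, c in counts)
probs = [float(c) / total for _, c in counts]
entropy = -sum(p * log(p) for p in probs)
print(entropy, log(4))  # both are about 1.386
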

sentenceLengths = []

tree_depth = []
arity = []

numberOfSentences = 0

for sentence in CorpusIteratorFuncHead(language, "train").iterator():
    orderSentence(sentence)
    sentenceLengths.append(len(sentence))
    numberOfSentences += 1
#print(sentenceLengths)
#print(arity)
#print(tree_depth)


def median(x):
    # note: picks the upper of the two middle elements when len(x) is even
    return sorted(x)[int(len(x) / 2)]


def mean(x):
    return float(sum(x)) / len(x)
Example #11
                    wordNum += 1

       if wordNum > 0:
          crossEntropy = 0.99 * crossEntropy + 0.01 * (totalDepLength/wordNum)
       else:
          assert totalDepLength == 0
       numberOfWords = wordNum
       return (totalDepLength, numberOfWords, byType)



assert batchSize == 1

depLengths = []
if True:
  corpus = CorpusIteratorFuncHead(args.language,"train")
  corpusIterator = corpus.iterator()
  if corpus.length() == 0:
     quit()
  while True:
    try:
       batch = [next(corpusIterator)]
    except StopIteration:
       break
    partitions = range(1)
    
    for partition in partitions:
       counter += 1
       printHere = (counter % 200 == 0)
       current = batch[partition*batchSize:(partition+1)*batchSize]
       if len(current) == 0:
Example #12
    sentCount = 0
    for sentence in corpus:
        sentCount += 1
        if sentCount % 10 == 0:
            print["DEV SENTENCES", sentCount]

        ordered, _ = orderSentence(sentence, dhLogits, sentCount % 500 == 0)

        for line in ordered + ["EOS"]:
            if line == "EOS":
                yield "EOS"
            else:
                yield line["word"]


corpusDev = CorpusIteratorFuncHead(
    args.language, "dev", storeMorph=True).iterator(rejectShortSentences=False)
dev = list(createStreamContinuous(corpusDev))[::-1]

corpusTrain = CorpusIteratorFuncHead(
    args.language, "train",
    storeMorph=True).iterator(rejectShortSentences=False)
train = list(createStreamContinuous(corpusTrain))[::-1]

idev = range(len(dev))
itrain = range(len(train))

idev = sorted(idev, key=lambda i: dev[i:i + 20])
itrain = sorted(itrain, key=lambda i: train[i:i + 20])

print(idev)
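
Sorting each position by the 20-token window that starts there turns idev and itrain into truncated suffix arrays: positions followed by similar contexts end up adjacent. A standalone sketch with toy data and a window of 3 instead of 20:

tokens = ["the", "cat", "sat", "on", "the", "cat"]
order = sorted(range(len(tokens)), key=lambda i: tokens[i:i + 3])
print(order)  # positions 4 and 0 both start "the cat ...", so they sort next to each other
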
Example #13
#        printHere = (counter % 5 == 0)
#        current = batch[partition*batchSize:(partition+1)*batchSize]
#
#        _, _, _, newLoss, newWords = doForwardPass(current)
#        devLoss += newLoss
#        devWords += newWords
#   return devLoss/devWords

#dhGradients_WSurp = deque(maxlen=50000) # * corpus.length())
#distanceGradients_WSurp = deque(maxlen=50000) # * corpus.length())

assert batchSize == 1

depLengths = []
while True:
    corpus = CorpusIteratorFuncHead(language, "train")
    corpusIterator = corpus.iterator()
    if corpus.length() == 0:
        quit()
    while True:
        try:
            batch = map(lambda x: next(corpusIterator), 10 * range(batchSize))
        except StopIteration:
            break
        batch = sorted(batch, key=len)
        partitions = range(10)
        shuffle(partitions)

        for partition in partitions:
            counter += 1
            printHere = (counter % 20 == 0)
Example #14
#        current = batch[partition*batchSize:(partition+1)*batchSize]
#
#        _, _, _, newLoss, newWords = doForwardPass(current)
#        devLoss += newLoss
#        devWords += newWords
#   return devLoss/devWords

#dhGradients_WSurp = deque(maxlen=50000) # * corpus.length())
#distanceGradients_WSurp = deque(maxlen=50000) # * corpus.length())

assert batchSize == 1

depLengths = []
#while True:
if True:
    corpus = CorpusIteratorFuncHead(language, "train")
    corpusIterator = corpus.iterator()
    if corpus.length() == 0:
        quit()
    while True:
        try:
            batch = map(lambda x: next(corpusIterator), 10 * range(batchSize))
        except StopIteration:
            break
        batch = sorted(batch, key=len)
        partitions = range(10)
        shuffle(partitions)

        for partition in partitions:
            counter += 1
            printHere = (counter % 100 == 0)
Example #15
            counter += 1
            printHere = (counter % 50 == 0)
            current = batch[partition * batchSize:(partition + 1) * batchSize]

            _, _, _, newLoss, newWords, lossWords, lossPOS = doForwardPass(
                current, train=False)
            devLoss += newLoss
            devWords += newWords
            devLossWords += lossWords
            devLossPOS += lossPOS
    return devLoss / devWords, devLossWords / devWords, devLossPOS / devWords


while True:
    #  corpus = getNextSentence("train")
    corpus = CorpusIteratorFuncHead(language)
    corpus.permute()
    corpus = corpus.iterator(rejectShortSentences=True)

    while True:
        try:
            batch = map(lambda x: next(corpus), 10 * range(batchSize))
        except StopIteration:
            break
        batch = sorted(batch, key=len)
        partitions = range(10)
        shuffle(partitions)
        for partition in partitions:
            counter += 1
            printHere = (counter % 100 == 0)
            current = batch[partition * batchSize:(partition + 1) * batchSize]
Example #16
]
languages = sorted(list(set(languages)))
assert len(languages) == 51

sizes = []

with open("../results/corpus-size/corpus-sizes.tsv", "w") as outFile:
    print >> outFile, "\t".join(
        map(str, [
            "language", "sents_train", "sents_dev", "words_train", "words_dev"
        ]))
    from corpusIterator_FuncHead import CorpusIteratorFuncHead
    for language in languages:
        sentsPerPart = {}
        wordsPerPart = {}
        for partition in ["train", "dev"]:
            sentsPerPart[partition] = 0
            wordsPerPart[partition] = 0
            corpus = CorpusIteratorFuncHead(language,
                                            partition=partition).iterator()
            for sentence in corpus:
                sentsPerPart[partition] += 1
                for line in sentence:
                    if line["posUni"] != "PUNCT":
                        wordsPerPart[partition] += 1
        print >> outFile, "\t".join(
            map(str, [
                language, sentsPerPart["train"], sentsPerPart["dev"],
                wordsPerPart["train"], wordsPerPart["dev"]
            ]))
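
Side note: print >> outFile is the Python 2 file-redirection idiom used above; the Python 3 equivalent of the header line, for reference, would be:

print("\t".join(map(str, ["language", "sents_train", "sents_dev",
                          "words_train", "words_dev"])), file=outFile)
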
Example #17
            "index", "word", "lemma", "posUni", "posFine", "morph", "head",
            "dep", "_", "_"
        ]

        originalDistanceWeights = {}

        orderTable = {}
        keys = set()
        vocab = {}
        distanceSum = {}
        distanceCounts = {}
        depsVocab = set()
        totalCount = 0
        for partition in ["train", "dev"]:
            for sentence in CorpusIteratorFuncHead(language,
                                                   partition,
                                                   storeMorph=True).iterator():
                for line in sentence:
                    vocab[line["word"]] = vocab.get(line["word"], 0) + 1
                    line["coarse_dep"] = line["dep"][:(line["dep"] +
                                                       ":").index(":")]
                    depsVocab.add(line["coarse_dep"])
                    posFine.add(line["posFine"])
                    posUni.add(line["posUni"])

                    if line["coarse_dep"] == "root":
                        continue
                    posHere = line["posUni"]
                    posHead = sentence[line["head"] - 1]["posUni"]
                    dep = line["coarse_dep"]
                    direction = "HD" if line["head"] < line["index"] else "DH"
Example #18
   quit()

dhWeights = Variable(torch.FloatTensor([0.0] * len(itos_pure_deps)), requires_grad=True)
distanceWeights = Variable(torch.FloatTensor([0.0] * len(itos_pure_deps)), requires_grad=True)
for i, key in enumerate(itos_pure_deps):
   dhLogits[key] = 0.0
   if key == "obj": 
       dhLogits[key] = (10.0 if posCount < negCount else -10.0)

   dhWeights.data[i] = dhLogits[key]

   originalDistanceWeights[key] = 0.0 #random()  
   distanceWeights.data[i] = originalDistanceWeights[key]


data_train = list(CorpusIteratorFuncHead(args.language,"train", storeMorph=True).iterator(rejectShortSentences = False))
data_dev = list(CorpusIteratorFuncHead(args.language,"dev", storeMorph=True).iterator(rejectShortSentences = False))

words = []

affixFrequency = {}

print(itos_pure_deps)
itos_pure_deps = sorted(list(itos_pure_deps) + ["HEAD"])
stoi_pure_deps = dict(list(zip(itos_pure_deps, range(len(itos_pure_deps)))))

itos_pure_deps_ = itos_pure_deps[::]
shuffle(itos_pure_deps_)
weights = dict(list(zip(itos_pure_deps_, [2*x for x in range(len(itos_pure_deps_))]))) # abstract slot

#print([[z["coarse_dep"] for z in y] for y in data_dev[:5]])
Example #19
       global counter
       global crossEntropy
       global printHere
       global devLosses
       global baselineAverageLoss
       batchOrderedLogits = zip(*map(lambda (y,x):orderSentence(x, dhLogits, y==0 and printHere), zip(range(len(current)),current)))
      
       batchOrdered = batchOrderedLogits[0]
       logits = batchOrderedLogits[1]
   
       logitCorr = batchOrdered[0][-1]["relevant_logprob_sum"]
       print(logitCorr)
       return float(logitCorr)


corpusDev = CorpusIteratorFuncHead(language,"dev").iterator(rejectShortSentences = True)

totalLikelihood = 0
numberOfSentences = 0

while True:
  try:
     batch = map(lambda x:next(corpusDev), 1*range(batchSize))
  except StopIteration:
     break
  batch = sorted(batch, key=len)
  partitions = range(1)
  shuffle(partitions)
  for partition in partitions:
     counter += 1
     printHere = (counter % 50 == 0)
Example #20
logsoftmax = torch.nn.LogSoftmax()

def deepCopy(sentence):
  # one-level copy: each token dict is copied, but the values are shared
  result = []
  for w in sentence:
     entry = {}
     for key, value in w.iteritems():
       entry[key] = value
     result.append(entry)
  return result
dhWeights_Prior = Normal(Variable(torch.FloatTensor([0.0] * len(itos_deps))), Variable(torch.FloatTensor([1.0]* len(itos_deps))))
distanceWeights_Prior = Normal(Variable(torch.FloatTensor([0.0] * len(itos_deps))), Variable(torch.FloatTensor([1.0]* len(itos_deps))))

counter = 0
corpus = CorpusIteratorFuncHead(language,"train")

def guide(corpus):
  mu_DH = pyro.param("mu_DH", Variable(torch.FloatTensor([0.0]*len(itos_deps)), requires_grad=True))
  mu_Dist = pyro.param("mu_Dist", Variable(torch.FloatTensor([0.0]*len(itos_deps)), requires_grad=True))

  sigma_DH = pyro.param("sigma_DH", Variable(torch.FloatTensor([1.0]*len(itos_deps)), requires_grad=True))
  sigma_Dist = pyro.param("sigma_Dist", Variable(torch.FloatTensor([1.0]*len(itos_deps)), requires_grad=True))

  dhWeights = pyro.sample("dhWeights", dist.Normal(mu_DH, sigma_DH)) #Variable(torch.FloatTensor([0.0] * len(itos_deps)), requires_grad=True)
  distanceWeights = pyro.sample("distanceWeights", dist.Normal(mu_Dist, sigma_Dist)) #Variable(torch.FloatTensor([0.0] * len(itos_deps)), requires_grad=True)

def model(corpus):
  global counter
  dhWeights = pyro.sample("dhWeights", dhWeights_Prior) #Variable(torch.FloatTensor([0.0] * len(itos_deps)), requires_grad=True)
  distanceWeights = pyro.sample("distanceWeights", distanceWeights_Prior) #Variable(torch.FloatTensor([0.0] * len(itos_deps)), requires_grad=True)
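
model and guide declare matching sample sites ("dhWeights", "distanceWeights"), which is how Pyro aligns the variational posterior with the prior. A sketch of how the two would typically be wired together with stochastic variational inference (the optimizer settings are assumptions, not from the original; the sigma parameters would normally also be constrained positive):

from pyro.infer import SVI, Trace_ELBO
from pyro.optim import Adam

svi = SVI(model, guide, Adam({"lr": 0.01}), loss=Trace_ELBO())
for step in range(1000):
    svi.step(corpus)  # one gradient step on the ELBO; args go to model and guide
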
Example #21
# Dirichlet smoothing
for hd in itos_pos_uni:
    roots[hd] = 1.0
    for dp in itos_pos_uni:
        for lr in "lr":
            productions[(hd, dp, lr)] = 1.0
    headCount[hd] = 1.0
orderTable = {}
keys = set()
vocab = {}
distanceSum = {}
distanceCounts = {}
depsVocab = set()
partition = "train"
for sentence in CorpusIteratorFuncHead(language, partition).iterator():
    for line in sentence:
        line["coarse_dep"] = makeCoarse(line["dep"])
        posHere = line["posUni"]
        if line["coarse_dep"] == "root":
            roots[posHere] += 1
            continue
        posHead = sentence[line["head"] - 1]["posUni"]
        dep = line["coarse_dep"]
        direction = "l" if (dhWeights[stoi_deps[dep]] > 0.5) else "r"
        #       direction = "r" if line["head"] < line["index"] else "l"
        productions[(posHead, posHere, direction)] += 1
        headCount[posHead] += 1
print(productions)

totalRootCount = sum([roots[x] for x in roots])
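
With the add-one (Dirichlet-style) initialization above, the count tables convert directly into distributions. A minimal sketch of the normalization (how the original consumes these tables is not shown in the fragment):

# Probability that a sentence root carries POS tag hd.
rootProb = {hd: roots[hd] / totalRootCount for hd in roots}

# Conditional probability of generating dependent POS dp on side lr,
# given head POS hd, by normalizing the smoothed counts per head.
headTotals = {}
for (hd, dp, lr), c in productions.items():
    headTotals[hd] = headTotals.get(hd, 0.0) + c
prodProb = {(hd, dp, lr): c / headTotals[hd]
            for (hd, dp, lr), c in productions.items()}
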