def computeDevLoss():
    """Run the parser over the dev partition and record per-word averages.

    Iterates over the partitions produced by ``getPartitions`` for the "dev"
    corpus (short sentences rejected), calls ``forward`` on each one with
    accuracy computation enabled and dropout disabled, and accumulates the
    loss, the unlabeled/labeled accuracy counts and the word count.

    The final per-word loss, accuracy and labeled accuracy are appended to
    ``devLosses``, ``devAccuracies`` and ``devAccuraciesLabeled``, which are
    defined outside this function.  Nothing is returned.

    Side effect: the global ``printHere`` flag is set for every partition
    (true on every 500th one).

    NOTE(review): all averages divide by the accumulated word count, so a
    dev set that yields no words is not handled here.
    """
    global printHere
    counterDev = 0
    corpusDev = CorpusIteratorFuncHead(
        language, "dev").iterator(rejectShortSentences=True)
    partitionsDev = getPartitions(corpusDev)
    devLoss = 0
    devAccuracy = 0
    devAccuracyLabeled = 0
    devWords = 0
    for partitionDev in partitionsDev:
        counterDev += 1
        printHere = (counterDev % 500 == 0)
        loss, _, accuracy, accuracyLabeled, wordNum = forward(
            partitionDev, computeAccuracy=True, doDropout=False)
        # The loss comes back as a tensor-like object; convert it to numpy
        # before accumulating.
        devLoss += loss.data.cpu().numpy()
        devAccuracy += accuracy
        devAccuracyLabeled += accuracyLabeled
        devWords += wordNum
        if counterDev % 50 == 0:
            # Progress report with the running averages so far.  The call
            # form of print is used so the statement parses on Python 2 and 3.
            print("Run on dev " + str(counterDev))
            print(devLoss / devWords, float(devAccuracy) / devWords,
                  float(devAccuracyLabeled) / devWords, devWords)

    newDevLoss = devLoss / devWords
    newDevAccuracy = float(devAccuracy) / devWords
    newDevAccuracyLabeled = float(devAccuracyLabeled) / devWords
    devLosses.append(newDevLoss)
    devAccuracies.append(newDevAccuracy)
    devAccuraciesLabeled.append(newDevAccuracyLabeled)
def initializeOrderTable():
    """Collect vocabulary and head-direction statistics from train and dev.

    For every token in the "train" and "dev" partitions this:

    * counts the word form in ``vocab``;
    * stores the coarse relation ``makeCoarse(line["dep"])`` on the token
      itself under ``"coarse_dep"`` and adds it to ``depsVocab``;
    * adds the token's fine and universal POS tags to the ``posFine`` and
      ``posUni`` collections defined outside this function;
    * for non-root tokens, counts how often each coarse relation occurs with
      the head before the dependent ("HD") or after it ("DH").

    Returns:
        A tuple ``(dhLogits, vocab, keys, depsVocab)`` where ``dhLogits``
        maps each coarse relation to ``log(dh) - log(hd)`` computed from the
        direction counts with 1.0 added to each, ``vocab`` maps word forms to
        counts, ``keys`` is the set of non-root coarse relations seen, and
        ``depsVocab`` is the set of all coarse relations seen (root included).
    """
    orderTable = {}
    keys = set()
    vocab = {}
    depsVocab = set()
    for partition in ["train", "dev"]:
        for sentence in CorpusIteratorFuncHead(language, partition).iterator():
            for line in sentence:
                vocab[line["word"]] = vocab.get(line["word"], 0) + 1
                line["coarse_dep"] = makeCoarse(line["dep"])
                depsVocab.add(line["coarse_dep"])
                posFine.add(line["posFine"])
                posUni.add(line["posUni"])
                if line["coarse_dep"] == "root":
                    # The root has no head, so it carries no direction.
                    continue
                dep = line["coarse_dep"]
                direction = "HD" if line["head"] < line["index"] else "DH"
                keyWithDir = (dep, direction)
                orderTable[keyWithDir] = orderTable.get(keyWithDir, 0) + 1
                keys.add(dep)

    dhLogits = {}
    for key in keys:
        # Adding 1.0 to both counts keeps the logarithms finite when a
        # relation was only ever seen in one direction.
        hd = orderTable.get((key, "HD"), 0) + 1.0
        dh = orderTable.get((key, "DH"), 0) + 1.0
        dhLogits[key] = log(dh) - log(hd)
    return dhLogits, vocab, keys, depsVocab
def computeDevLoss():
    """Return the average per-word language-model loss on the dev partition.

    Sentences are drawn from the "dev" corpus (short sentences rejected) in
    chunks of ``10 * args.batchSize``.  Each chunk is sorted by length, cut
    into ten mini-batches that are visited in shuffled order, and passed to
    ``doForwardPass(..., train=False)``.  The fourth and fifth elements of its
    five-element result are accumulated as loss and word count.

    Side effects: increments the global ``counter`` once per mini-batch and
    sets the global ``printHere`` flag (true on every 50th mini-batch).

    NOTE(review): when the corpus runs out while a chunk is being filled, the
    sentences already drawn for that chunk are discarded, so the tail of the
    dev set is not evaluated.  If no full chunk is available the final
    division is by zero.
    """
    global printHere
    global counter
    numPartitions = 10
    totalLoss = 0.0
    totalWords = 0
    devStream = CorpusIteratorFuncHead(
        args.language, "dev").iterator(rejectShortSentences=True)
    while True:
        try:
            batch = [next(devStream)
                     for _ in range(numPartitions * args.batchSize)]
        except StopIteration:
            break
        batch.sort(key=len)
        order = list(range(numPartitions))
        shuffle(order)
        for partition in order:
            counter += 1
            printHere = (counter % 50 == 0)
            start = partition * args.batchSize
            current = batch[start:start + args.batchSize]
            _, _, _, lossHere, wordsHere = doForwardPass(current, train=False)
            totalLoss += lossHere
            totalWords += wordsHere
    return totalLoss / totalWords
def computeDevLoss():
    """Return per-word dev losses: overall, word-level and POS-level.

    Sentences are drawn from the "dev" corpus (short sentences rejected)
    ``batchSize`` at a time, sorted by length and passed to
    ``doForwardPass(..., train=False)``.  The last four elements of its
    seven-element result are accumulated as total loss, word count, word loss
    and POS loss.

    Returns:
        A tuple ``(loss, wordLoss, posLoss)``, each divided by the total
        number of dev words processed.

    Side effects: increments the global ``counter`` once per batch and sets
    the global ``printHere`` flag (true on every 50th batch).

    NOTE(review): a final batch that cannot be filled completely is
    discarded, and an empty evaluation ends in a division by zero.
    """
    global printHere
    global counter
    totalLoss = 0.0
    totalLossWords = 0.0
    totalLossPOS = 0.0
    totalWords = 0
    devStream = CorpusIteratorFuncHead(
        language, "dev").iterator(rejectShortSentences=True)
    while True:
        try:
            batch = [next(devStream) for _ in range(batchSize)]
        except StopIteration:
            break
        batch.sort(key=len)
        # Only one partition here; the shuffle call is kept so the sequence
        # of calls matches the multi-partition variants of this loop.
        order = list(range(1))
        shuffle(order)
        for partition in order:
            counter += 1
            printHere = (counter % 50 == 0)
            start = partition * batchSize
            current = batch[start:start + batchSize]
            result = doForwardPass(current, train=False)
            _, _, _, lossHere, wordsHere, lossWordsHere, lossPOSHere = result
            totalLoss += lossHere
            totalWords += wordsHere
            totalLossWords += lossWordsHere
            totalLossPOS += lossPOSHere
    return (totalLoss / totalWords,
            totalLossWords / totalWords,
            totalLossPOS / totalWords)
def computeDevLoss():
    """Evaluate the language model and the parser jointly on the dev set.

    Sentences are drawn from the "dev" corpus (short sentences rejected) in
    chunks of ``10 * args.batchSize``, sorted by length, and split into ten
    mini-batches visited in shuffled order.  Each mini-batch goes through
    ``doForwardPass`` with training off, parser accuracy computation on and
    parser dropout off; the call returns a language-model result and a parser
    result, which are accumulated separately.

    Returns:
        A tuple ``(lmLossPerWord, parserLossPerWord)``.

    Side effects: increments the global ``counter`` per mini-batch, sets the
    global ``printHere`` flag (true on every 50th mini-batch), and appends
    the unlabeled and labeled parser accuracies to the global lists
    ``devAccuracies_parser`` and ``devAccuraciesLabeled_parser``.

    NOTE(review): sentences drawn for a chunk that cannot be filled are
    discarded, and the averages divide by the accumulated word counts without
    a zero check.
    """
    global printHere
    global counter
    global devAccuracies_parser
    global devAccuraciesLabeled_parser

    numPartitions = 10
    lmLoss = 0.0
    lmWords = 0
    parserLoss = 0.0
    parserWords = 0
    parserCorrect = 0
    parserCorrectLabeled = 0

    devStream = CorpusIteratorFuncHead(
        args.language, "dev").iterator(rejectShortSentences=True)
    while True:
        # Pull the next chunk of dev sentences; stop once the corpus is
        # exhausted.
        try:
            batch = [next(devStream)
                     for _ in range(numPartitions * args.batchSize)]
        except StopIteration:
            break
        batch.sort(key=len)
        order = list(range(numPartitions))
        shuffle(order)
        for partition in order:
            counter += 1
            printHere = (counter % 50 == 0)
            start = partition * args.batchSize
            current = batch[start:start + args.batchSize]

            fromLM, fromParser = doForwardPass(
                current,
                train=False,
                computeAccuracy_parser=True,
                doDropout_parser=False)
            _, _, _, lmLossHere, lmWordsHere = fromLM
            (parserLossHere, _, correctHere, correctLabeledHere,
             parserWordsHere) = fromParser

            lmLoss += lmLossHere
            lmWords += lmWordsHere
            # The parser loss is a tensor-like object; convert it to numpy
            # before accumulating.
            parserLoss += parserLossHere.data.cpu().numpy()
            parserCorrect += correctHere
            parserCorrectLabeled += correctLabeledHere
            parserWords += parserWordsHere

    devAccuracies_parser.append(float(parserCorrect) / parserWords)
    devAccuraciesLabeled_parser.append(
        float(parserCorrectLabeled) / parserWords)
    return lmLoss / lmWords, parserLoss / parserWords
def initializeOrderTable():
    """Gather vocabulary, direction and distance statistics from the corpus.

    Walks the "train" and "dev" partitions (morphology stored) and, for each
    token, updates the word counts in ``vocab`` and the lemma counts in
    ``vocab_lemmas``, and adds the relation, POS tags and morphological
    entries to ``depsVocab``, ``posFine``, ``posUni`` and
    ``morphKeyValuePairs``.  Apart from ``vocab`` and ``depsVocab``, these
    collections are defined outside this function.

    Non-root tokens are keyed by ``(head POS, relation, dependent POS)``.
    For each key the function counts head-first ("HD") versus dependent-first
    ("DH") occurrences and the absolute head-dependent distance.

    Returns:
        A tuple ``(dhLogits, vocab, keys, depsVocab)``; ``dhLogits`` maps
        each key to ``log(dh) - log(hd)`` with 1.0 added to both counts.

    Side effect: ``originalDistanceWeights[key]`` is set to the mean absolute
    head-dependent distance for every key.
    """
    orderTable = {}
    keys = set()
    vocab = {}
    distanceSum = {}
    distanceCounts = {}
    depsVocab = set()

    for partition in ["train", "dev"]:
        corpus = CorpusIteratorFuncHead(
            args.language, partition, storeMorph=True)
        for sentence in corpus.iterator():
            for line in sentence:
                vocab[line["word"]] = vocab.get(line["word"], 0) + 1
                vocab_lemmas[line["lemma"]] = (
                    vocab_lemmas.get(line["lemma"], 0) + 1)
                depsVocab.add(line["dep"])
                posFine.add(line["posFine"])
                posUni.add(line["posUni"])
                for morph in line["morph"]:
                    morphKeyValuePairs.add(morph)
                if line["dep"] == "root":
                    # The root has no head: no direction, no distance.
                    continue

                # "head" is 1-based, so subtract one to index the sentence.
                headPos = sentence[line["head"] - 1]["posUni"]
                key = (headPos, line["dep"], line["posUni"])
                if line["head"] < line["index"]:
                    direction = "HD"
                else:
                    direction = "DH"
                keyWithDir = key + (direction,)

                orderTable[keyWithDir] = orderTable.get(keyWithDir, 0) + 1
                keys.add(key)
                distanceCounts[key] = distanceCounts.get(key, 0.0) + 1.0
                distanceSum[key] = (distanceSum.get(key, 0.0)
                                    + abs(line["index"] - line["head"]))

    dhLogits = {}
    for key in keys:
        hd = orderTable.get(key + ("HD",), 0) + 1.0
        dh = orderTable.get(key + ("DH",), 0) + 1.0
        dhLogits[key] = log(dh) - log(hd)
        originalDistanceWeights[key] = distanceSum[key] / distanceCounts[key]
    return dhLogits, vocab, keys, depsVocab
for partition in partitions: counter += 1 printHere = (counter % 50 == 0) current = batch[partition * batchSize:(partition + 1) * batchSize] _, _, _, newLoss, newWords, lossWords, lossPOS = doForwardPass( current, train=False) devLoss += newLoss devWords += newWords devLossWords += lossWords devLossPOS += lossPOS return devLoss / devWords, devLossWords / devWords, devLossPOS / devWords while True: corpus = CorpusIteratorFuncHead(language).iterator( rejectShortSentences=True) while True: try: batch = map(lambda x: next(corpus), 10 * range(batchSize)) except StopIteration: break batch = sorted(batch, key=len) partitions = range(10) shuffle(partitions) for partition in partitions: counter += 1 printHere = (counter % 100 == 0) current = batch[partition * batchSize:(partition + 1) * batchSize] loss, baselineLoss, policy_related_loss, _, wordNumInPass, lossWords, lossPOS = doForwardPass(
batchSize = 1 lr_lm = 0.1 crossEntropy = 10.0 def encodeWord(w): return stoi[w] + 3 if stoi[w] < vocab_size else 1 import torch.nn.functional counter = 0 while True: corpus = CorpusIterator(args.language, partition="together") corpus.permute() corpus = corpus.iterator(rejectShortSentences=False) for current in corpus: if counter > 50000000: print("Quitting at counter " + str(counter)) quit() counter += 1 printHere = (counter % 50 == 0) current = [current] batchOrdered, logits = orderSentence(current[0], dhLogits, printHere) metadata = current[0][1] maxLength = len(batchOrdered)
assert len(languages) == 51, len(languages) from corpusIterator_FuncHead import CorpusIteratorFuncHead counts = {} directions = {} for language in languages: #[:2]: orderTable = {} keys = set() vocab = {} distanceSum = {} distanceCounts = {} depsVocab = set() for partition in ["train", "dev"]: for sentence in CorpusIteratorFuncHead(language, partition).iterator(): for line in sentence: vocab[line["word"]] = vocab.get(line["word"], 0) + 1 dep = line["dep"] if dep not in counts: counts[dep] = {} directions[dep] = {} if language not in counts[dep]: counts[dep][language] = 0 directions[dep][language] = 0 counts[dep][language] += 1 directions[dep][language] += ( 1 if line["index"] > line["head"] else -1) # 1 == DH order with open("relations.tsv", "w") as outFile: for dep in sorted(list(counts)): coarse = dep[:dep.index(":")] if ":" in dep else dep
# Unigram entropy of the word distribution collected in ``vocab``
# (word -> count).
words = list(vocab.items())
totalCount = sum(entry[1] for entry in words)
probs = [float(x[1]) / totalCount for x in words]
unigram_entropy = -sum([x * log(x) for x in probs])

# Per-sentence statistics over the training partition.  ``orderSentence`` is
# called on every sentence; only the sentence lengths are appended here.
sentenceLengths = []
tree_depth = []
arity = []
numberOfSentences = 0
for sentence in CorpusIteratorFuncHead(language, "train").iterator():
    orderSentence(sentence)
    sentenceLengths.append(len(sentence))
    numberOfSentences += 1


def median(x):
    """Return the element at index ``len(x) // 2`` of ``x`` sorted.

    For an even number of elements this is the upper of the two middle
    values, not their average.
    """
    ordered = sorted(x)
    return ordered[len(ordered) // 2]


def mean(x):
    """Return the arithmetic mean of ``x`` as a float."""
    total = float(sum(x))
    return total / len(x)
wordNum += 1 if wordNum > 0: crossEntropy = 0.99 * crossEntropy + 0.01 * (totalDepLength/wordNum) else: assert totalDepLength == 0 numberOfWords = wordNum return (totalDepLength, numberOfWords, byType) assert batchSize == 1 depLengths = [] if True: corpus = CorpusIteratorFuncHead(args.language,"train") corpusIterator = corpus.iterator() if corpus.length() == 0: quit() while True: try: batch = [next(corpusIterator)] except StopIteration: break partitions = range(1) for partition in partitions: counter += 1 printHere = (counter % 200 == 0) current = batch[partition*batchSize:(partition+1)*batchSize] if len(current) == 0:
sentCount = 0 for sentence in corpus: sentCount += 1 if sentCount % 10 == 0: print["DEV SENTENCES", sentCount] ordered, _ = orderSentence(sentence, dhLogits, sentCount % 500 == 0) for line in ordered + ["EOS"]: if line == "EOS": yield "EOS" else: yield line["word"] corpusDev = CorpusIteratorFuncHead( args.language, "dev", storeMorph=True).iterator(rejectShortSentences=False) dev = list(createStreamContinuous(corpusDev))[::-1] corpusTrain = CorpusIteratorFuncHead( args.language, "train", storeMorph=True).iterator(rejectShortSentences=False) train = list(createStreamContinuous(corpusTrain))[::-1] idev = range(len(dev)) itrain = range(len(train)) idev = sorted(idev, key=lambda i: dev[i:i + 20]) itrain = sorted(itrain, key=lambda i: train[i:i + 20]) print(idev)
# printHere = (counter % 5 == 0) # current = batch[partition*batchSize:(partition+1)*batchSize] # # _, _, _, newLoss, newWords = doForwardPass(current) # devLoss += newLoss # devWords += newWords # return devLoss/devWords #dhGradients_WSurp = deque(maxlen=50000) # * corpus.length()) #distanceGradients_WSurp = deque(maxlen=50000) # * corpus.length()) assert batchSize == 1 depLengths = [] while True: corpus = CorpusIteratorFuncHead(language, "train") corpusIterator = corpus.iterator() if corpus.length() == 0: quit() while True: try: batch = map(lambda x: next(corpusIterator), 10 * range(batchSize)) except StopIteration: break batch = sorted(batch, key=len) partitions = range(10) shuffle(partitions) for partition in partitions: counter += 1 printHere = (counter % 20 == 0)
# current = batch[partition*batchSize:(partition+1)*batchSize] # # _, _, _, newLoss, newWords = doForwardPass(current) # devLoss += newLoss # devWords += newWords # return devLoss/devWords #dhGradients_WSurp = deque(maxlen=50000) # * corpus.length()) #distanceGradients_WSurp = deque(maxlen=50000) # * corpus.length()) assert batchSize == 1 depLengths = [] #while True: if True: corpus = CorpusIteratorFuncHead(language, "train") corpusIterator = corpus.iterator() if corpus.length() == 0: quit() while True: try: batch = map(lambda x: next(corpusIterator), 10 * range(batchSize)) except StopIteration: break batch = sorted(batch, key=len) partitions = range(10) shuffle(partitions) for partition in partitions: counter += 1 printHere = (counter % 100 == 0)
counter += 1 printHere = (counter % 50 == 0) current = batch[partition * batchSize:(partition + 1) * batchSize] _, _, _, newLoss, newWords, lossWords, lossPOS = doForwardPass( current, train=False) devLoss += newLoss devWords += newWords devLossWords += lossWords devLossPOS += lossPOS return devLoss / devWords, devLossWords / devWords, devLossPOS / devWords while True: # corpus = getNextSentence("train") corpus = CorpusIteratorFuncHead(language) corpus.permute() corpus = corpus.iterator(rejectShortSentences=True) while True: try: batch = map(lambda x: next(corpus), 10 * range(batchSize)) except StopIteration: break batch = sorted(batch, key=len) partitions = range(10) shuffle(partitions) for partition in partitions: counter += 1 printHere = (counter % 100 == 0) current = batch[partition * batchSize:(partition + 1) * batchSize]
] languages = sorted(list(set(languages))) assert len(languages) == 51 sizes = [] with open("../results/corpus-size/corpus-sizes.tsv", "w") as outFile: print >> outFile, "\t".join( map(str, [ "language", "sents_train", "sents_dev", "words_train", "words_dev" ])) from corpusIterator_FuncHead import CorpusIteratorFuncHead for language in languages: sentsPerPart = {} wordsPerPart = {} for partition in ["train", "dev"]: sentsPerPart[partition] = 0 wordsPerPart[partition] = 0 corpus = CorpusIteratorFuncHead(language, partition=partition).iterator() for sentence in corpus: sentsPerPart[partition] += 1 for line in sentence: if line["posUni"] != "PUNCT": wordsPerPart[partition] += 1 print >> outFile, "\t".join( map(str, [ language, sentsPerPart["train"], sentsPerPart["dev"], wordsPerPart["train"], wordsPerPart["dev"] ]))
"index", "word", "lemma", "posUni", "posFine", "morph", "head", "dep", "_", "_" ] originalDistanceWeights = {} orderTable = {} keys = set() vocab = {} distanceSum = {} distanceCounts = {} depsVocab = set() totalCount = 0 for partition in ["train", "dev"]: for sentence in CorpusIteratorFuncHead(language, partition, storeMorph=True).iterator(): for line in sentence: vocab[line["word"]] = vocab.get(line["word"], 0) + 1 line["coarse_dep"] = line["dep"][:(line["dep"] + ":").index(":")] depsVocab.add(line["coarse_dep"]) posFine.add(line["posFine"]) posUni.add(line["posUni"]) if line["coarse_dep"] == "root": continue posHere = line["posUni"] posHead = sentence[line["head"] - 1]["posUni"] dep = line["coarse_dep"] direction = "HD" if line["head"] < line["index"] else "DH"
quit() dhWeights = Variable(torch.FloatTensor([0.0] * len(itos_pure_deps)), requires_grad=True) distanceWeights = Variable(torch.FloatTensor([0.0] * len(itos_pure_deps)), requires_grad=True) for i, key in enumerate(itos_pure_deps): dhLogits[key] = 0.0 if key == "obj": dhLogits[key] = (10.0 if posCount < negCount else -10.0) dhWeights.data[i] = dhLogits[key] originalDistanceWeights[key] = 0.0 #random() distanceWeights.data[i] = originalDistanceWeights[key] data_train = list(CorpusIteratorFuncHead(args.language,"train", storeMorph=True).iterator(rejectShortSentences = False)) data_dev = list(CorpusIteratorFuncHead(args.language,"dev", storeMorph=True).iterator(rejectShortSentences = False)) words = [] affixFrequency = {} print(itos_pure_deps) itos_pure_deps = sorted(list(itos_pure_deps) + ["HEAD"]) stoi_pure_deps = dict(list(zip(itos_pure_deps, range(len(itos_pure_deps))))) itos_pure_deps_ = itos_pure_deps[::] shuffle(itos_pure_deps_) weights = dict(list(zip(itos_pure_deps_, [2*x for x in range(len(itos_pure_deps_))]))) # abstract slot #print([[z["coarse_dep"] for z in y] for y in data_dev[:5]])
global counter global crossEntropy global printHere global devLosses global baselineAverageLoss batchOrderedLogits = zip(*map(lambda (y,x):orderSentence(x, dhLogits, y==0 and printHere), zip(range(len(current)),current))) batchOrdered = batchOrderedLogits[0] logits = batchOrderedLogits[1] logitCorr = batchOrdered[0][-1]["relevant_logprob_sum"] print(logitCorr) return float(logitCorr) corpusDev = CorpusIteratorFuncHead(language,"dev").iterator(rejectShortSentences = True) totalLikelihood = 0 numberOfSentences = 0 while True: try: batch = map(lambda x:next(corpusDev), 1*range(batchSize)) except StopIteration: break batch = sorted(batch, key=len) partitions = range(1) shuffle(partitions) for partition in partitions: counter += 1 printHere = (counter % 50 == 0)
logsoftmax = torch.nn.LogSoftmax() def deepCopy(sentence): result = [] for w in sentence: entry = {} for key, value in w.iteritems(): entry[key] = value result.append(entry) return result dhWeights_Prior = Normal(Variable(torch.FloatTensor([0.0] * len(itos_deps))), Variable(torch.FloatTensor([1.0]* len(itos_deps)))) distanceWeights_Prior = Normal(Variable(torch.FloatTensor([0.0] * len(itos_deps))), Variable(torch.FloatTensor([1.0]* len(itos_deps)))) counter = 0 corpus = CorpusIteratorFuncHead(language,"train") def guide(corpus): mu_DH = pyro.param("mu_DH", Variable(torch.FloatTensor([0.0]*len(itos_deps)), requires_grad=True)) mu_Dist = pyro.param("mu_Dist", Variable(torch.FloatTensor([0.0]*len(itos_deps)), requires_grad=True)) sigma_DH = pyro.param("sigma_DH", Variable(torch.FloatTensor([1.0]*len(itos_deps)), requires_grad=True)) sigma_Dist = pyro.param("sigma_Dist", Variable(torch.FloatTensor([1.0]*len(itos_deps)), requires_grad=True)) dhWeights = pyro.sample("dhWeights", dist.Normal(mu_DH, sigma_DH)) #Variable(torch.FloatTensor([0.0] * len(itos_deps)), requires_grad=True) distanceWeights = pyro.sample("distanceWeights", dist.Normal(mu_Dist, sigma_Dist)) #Variable(torch.FloatTensor([0.0] * len(itos_deps)), requires_grad=True) def model(corpus): global counter dhWeights = pyro.sample("dhWeights", dhWeights_Prior) #Variable(torch.FloatTensor([0.0] * len(itos_deps)), requires_grad=True) distanceWeights = pyro.sample("distanceWeights", distanceWeights_Prior) #Variable(torch.FloatTensor([0.0] * len(itos_deps)), requires_grad=True)
# Dirichlet smoothing for hd in itos_pos_uni: roots[hd] = 1.0 for dp in itos_pos_uni: for lr in "lr": productions[(hd, dp, lr)] = 1.0 headCount[hd] = 1.0 orderTable = {} keys = set() vocab = {} distanceSum = {} distanceCounts = {} depsVocab = set() partition = "train" for sentence in CorpusIteratorFuncHead(language, partition).iterator(): for line in sentence: line["coarse_dep"] = makeCoarse(line["dep"]) posHere = line["posUni"] if line["coarse_dep"] == "root": roots[posHere] += 1 continue posHead = sentence[line["head"] - 1]["posUni"] dep = line["coarse_dep"] direction = "l" if (dhWeights[stoi_deps[dep]] > 0.5) else "r" # direction = "r" if line["head"] < line["index"] else "l" productions[(posHead, posHere, direction)] += 1 headCount[posHead] += 1 print(productions) totalRootCount = sum([roots[x] for x in roots])