def initializeOrderTable():
    orderTable = {}
    keys = set()
    vocab = {}
    distanceSum = {}
    distanceCounts = {}
    depsVocab = set()
    keysCounts = defaultdict(int)
    for partition in ["together"]:
        for sentence in CorpusIterator(args.language, partition).iterator():
            for line in sentence:
                # Skip sentence-end markers and disfluency/unknown POS tokens.
                if line["word"] == "e_s":
                    continue
                if line["posUni"] == "-DFL-":
                    continue
                if line["posUni"] == "XX":
                    continue
                vocab[line["word"]] = vocab.get(line["word"], 0) + 1
                line["fine_dep"] = line["dep"]
                depsVocab.add(line["fine_dep"])
                posFine.add(line["posFine"])
                processPOS(line["posUni"])
                posUni.add(ptb2uni[line["posUni"]])
                if line["head"] is None:
                    continue
                if line["fine_dep"] == "root":
                    continue
                posHere = ptb2uni[line["posUni"]]
                # Also skip tokens whose head is a disfluency/unknown token.
                if sentence[line["head"] - 1]["posUni"] == "-DFL-":
                    continue
                if sentence[line["head"] - 1]["posUni"] == "XX":
                    continue
                posHead0 = sentence[line["head"] - 1]["posUni"]
                processPOS(posHead0)
                posHead = ptb2uni[sentence[line["head"] - 1]["posUni"]]
                dep = line["fine_dep"]
                direction = "HD" if line["head"] < line["index"] else "DH"
                key = dep  # alternatively: (posHead, dep, posHere)
                keyWithDir = (dep, direction)
                orderTable[keyWithDir] = orderTable.get(keyWithDir, 0) + 1
                keys.add(key)
                keysCounts[key] += 1
                distanceCounts[key] = distanceCounts.get(key, 0.0) + 1.0
                distanceSum[key] = distanceSum.get(key, 0.0) + abs(line["index"] - line["head"])
    # Add-one smoothed log-odds of dependent-head vs. head-dependent order.
    dhLogits = {}
    for key in keys:
        hd = orderTable.get((key, "HD"), 0) + 1.0
        dh = orderTable.get((key, "DH"), 0) + 1.0
        dhLogits[key] = log(dh) - log(hd)
    return dhLogits, vocab, keys, depsVocab
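# For intuition: dhLogit above is an add-one-smoothed log-odds, so the implied
# probability of dependent-before-head order is a sigmoid away. Illustrative
# sketch only -- the name and the counts below are invented, not from a corpus.
from math import exp

def _dh_probability_example(dh_count, hd_count):
    # Matches the smoothing in the loop above: log(dh + 1) - log(hd + 1).
    logit = log(dh_count + 1.0) - log(hd_count + 1.0)
    return 1.0 / (1.0 + exp(-logit))

# A hypothetical relation seen 30 times dependent-first, 10 times head-first:
# _dh_probability_example(30, 10) -> ~0.74, i.e. mostly dependent-before-head.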
def __init__(self, language, partition="train", storeMorph=False,
             splitLemmas=False, shuffleData=True):
    self.basis = CorpusIterator(language, partition=partition,
                                storeMorph=storeMorph,
                                splitLemmas=splitLemmas,
                                shuffleData=shuffleData)
def __init__(self, language, partition="train", storeMorph=False,
             splitLemmas=False, shuffleData=True):
    # Pool the original train/dev/test splits, then re-split with a fixed
    # seed: 5% (at least 100 sentences) becomes the new dev set.
    self.basis_train = list(
        CorpusIterator(language, partition="train", storeMorph=storeMorph,
                       splitLemmas=splitLemmas, shuffleData=False,
                       shuffleDataSeed=5, errorWhenEmpty=False)
        .iterator(rejectShortSentences=False))
    self.basis_dev = list(
        CorpusIterator(language, partition="dev", storeMorph=storeMorph,
                       splitLemmas=splitLemmas, shuffleData=False,
                       shuffleDataSeed=5, errorWhenEmpty=False)
        .iterator(rejectShortSentences=False))
    self.basis_test = list(
        CorpusIterator(language, partition="test", storeMorph=storeMorph,
                       splitLemmas=splitLemmas, shuffleData=False,
                       shuffleDataSeed=5, errorWhenEmpty=False)
        .iterator(rejectShortSentences=False))
    self.basis = self.basis_train + self.basis_dev + self.basis_test
    random.Random(5).shuffle(self.basis)
    DEV_SIZE = max(100, int(0.05 * len(self.basis)))
    if partition == "dev":
        self.basis = self.basis[:DEV_SIZE]
    else:
        self.basis = self.basis[DEV_SIZE:]
def initializeOrderTable():
    orderTable = {}
    keys = set()
    vocab = {}
    distanceSum = {}
    distanceCounts = {}
    depsVocab = set()
    depsVocab.add("root")
    for partition in ["together"]:
        for sentence in CorpusIterator(args.language, partition).iterator():
            sentenceHash = hash_(" ".join([x["word"] for x in sentence]))
            for line in sentence:
                vocab[line["word"]] = vocab.get(line["word"], 0) + 1
                line["dep"] = makeCoarse(line["dep"])
                posFine.add(line["posFine"])
                posUni.add(line["posUni"])
                if line["dep"] == "root":
                    continue
                posHere = line["posUni"]
                posHead = sentence[line["head"] - 1]["posUni"]
                # Split every nsubj dependency into its own relation type,
                # keyed by sentence hash and token position.
                if line["dep"] == "nsubj":
                    line["dep"] = "nsubj_" + str(sentenceHash) + "_" + str(line["index"])
                line["fine_dep"] = line["dep"]
                depsVocab.add(line["fine_dep"])
                dep = line["fine_dep"]
                direction = "HD" if line["head"] < line["index"] else "DH"
                key = dep
                keyWithDir = (dep, direction)
                orderTable[keyWithDir] = orderTable.get(keyWithDir, 0) + 1
                keys.add(key)
                distanceCounts[key] = distanceCounts.get(key, 0.0) + 1.0
                distanceSum[key] = distanceSum.get(key, 0.0) + abs(line["index"] - line["head"])
    dhLogits = {}
    for key in keys:
        hd = orderTable.get((key, "HD"), 0) + 1.0
        dh = orderTable.get((key, "DH"), 0) + 1.0
        dhLogits[key] = log(dh) - log(hd)
    return dhLogits, vocab, keys, depsVocab
def initializeOrderTable():
    orderTable = {}
    keys = set()
    vocab = {}
    distanceSum = {}
    distanceCounts = {}
    depsVocab = set()
    for partition in ["train"]:
        for sentence in CorpusIterator(args.language, partition,
                                       storeMorph=True).iterator():
            for line in sentence:
                vocab[line["word"]] = vocab.get(line["word"], 0) + 1
                vocab_lemmas[line["lemma"]] = vocab_lemmas.get(line["lemma"], 0) + 1
                depsVocab.add(line["dep"])
                posFine.add(line["posFine"])
                posUni.add(line["posUni"])
                for morph in line["morph"]:
                    morphKeyValuePairs.add(morph)
                if line["dep"] == "root":
                    continue
                posHere = line["posUni"]
                posHead = sentence[line["head"] - 1]["posUni"]
                dep = line["dep"]
                direction = "HD" if line["head"] < line["index"] else "DH"
                key = (posHead, dep, posHere)
                keyWithDir = (posHead, dep, posHere, direction)
                orderTable[keyWithDir] = orderTable.get(keyWithDir, 0) + 1
                keys.add(key)
                distanceCounts[key] = distanceCounts.get(key, 0.0) + 1.0
                distanceSum[key] = distanceSum.get(key, 0.0) + abs(line["index"] - line["head"])
    dhLogits = {}
    for key in keys:
        hd = orderTable.get((key[0], key[1], key[2], "HD"), 0) + 1.0
        dh = orderTable.get((key[0], key[1], key[2], "DH"), 0) + 1.0
        dhLogits[key] = log(dh) - log(hd)
        originalDistanceWeights[key] = distanceSum[key] / distanceCounts[key]
    return dhLogits, vocab, keys, depsVocab
def initializeOrderTable():
    orderTable = {}
    keys = set()
    vocab = {}
    distanceSum = {}
    distanceCounts = {}
    depsVocab = set()
    keysCounts = defaultdict(int)
    for partition in ["together"]:
        for sentence in CorpusIterator(args.language, partition).iterator():
            for line in sentence:
                vocab[line["word"]] = vocab.get(line["word"], 0) + 1
                line["fine_dep"] = line["dep"]
                depsVocab.add(line["fine_dep"])
                posFine.add(line["posFine"])
                posUni.add(line["posUni"])
                if line["fine_dep"] == "root":
                    continue
                posHere = line["posUni"]
                posHead = sentence[line["head"] - 1]["posUni"]
                dep = line["fine_dep"]
                direction = "HD" if line["head"] < line["index"] else "DH"
                key = (posHead, dep, posHere)
                keyWithDir = (dep, direction)
                orderTable[keyWithDir] = orderTable.get(keyWithDir, 0) + 1
                keys.add(key)
                keysCounts[key] += 1
                distanceCounts[key] = distanceCounts.get(key, 0.0) + 1.0
                distanceSum[key] = distanceSum.get(key, 0.0) + abs(line["index"] - line["head"])
    dhLogits = {}
    for key in keys:
        # orderTable is keyed by (dep, direction), so look up the dep component.
        hd = orderTable.get((key[1], "HD"), 0) + 1.0
        dh = orderTable.get((key[1], "DH"), 0) + 1.0
        dhLogits[key] = log(dh) - log(hd)
    # Print the 30 most frequent (headPOS, dep, dependentPOS) keys, then stop.
    keysCounts = sorted(list(keysCounts.items()), key=lambda x: x[1])
    for x in keysCounts[-30:]:
        print(x)
    quit()
    return dhLogits, vocab, keys, depsVocab
def initializeOrderTable():
    orderTable = {}
    keys = set()
    vocab = {}
    distanceSum = {}
    distanceCounts = {}
    depsVocab = set()
    for partition in ["together"]:
        for sentence in CorpusIterator(language, partition, size=size,
                                       shuffleDataSeed=50).iterator():
            for line in sentence:
                vocab[line["word"]] = vocab.get(line["word"], 0) + 1
                line["coarse_dep"] = makeCoarse(line["dep"])
                depsVocab.add(line["coarse_dep"])
                posFine.add(line["posFine"])
                posUni.add(line["posUni"])
                if line["coarse_dep"] == "root":
                    continue
                posHere = line["posUni"]
                posHead = sentence[line["head"] - 1]["posUni"]
                dep = line["coarse_dep"]
                direction = "HD" if line["head"] < line["index"] else "DH"
                key = dep
                keyWithDir = (dep, direction)
                orderTable[keyWithDir] = orderTable.get(keyWithDir, 0) + 1
                keys.add(key)
                distanceCounts[key] = distanceCounts.get(key, 0.0) + 1.0
                distanceSum[key] = distanceSum.get(key, 0.0) + abs(line["index"] - line["head"])
    dhLogits = {}
    for key in keys:
        hd = orderTable.get((key, "HD"), 0) + 1.0
        dh = orderTable.get((key, "DH"), 0) + 1.0
        dhLogits[key] = log(dh) - log(hd)
        originalDistanceWeights[key] = distanceSum[key] / distanceCounts[key]
    return dhLogits, vocab, keys, depsVocab
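# Side note (illustration, invented numbers): originalDistanceWeights[key]
# above is simply the mean absolute head-dependent distance per relation,
# e.g. distances 1, 2, 3 -> distanceSum 6.0 / distanceCounts 3.0 = 2.0.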
def initializeOrderTable():
    orderTable = {}
    keys = set()
    vocab = {}
    distanceSum = {}
    distanceCounts = {}
    depsVocab = set()
    for partition in ["together"]:
        for sentence, metadata in CorpusIterator(args.language, partition).iterator():
            docs[metadata["newdoc id"]] += 1
            for line in sentence:
                vocab[line["word"]] = vocab.get(line["word"], 0) + 1
                line["fine_dep"] = line["dep"]
                depsVocab.add(line["fine_dep"])
                posFine.add(line["posFine"])
                posUni.add(line["posUni"])
                if line["fine_dep"] == "root":
                    continue
                posHere = line["posUni"]
                posHead = sentence[line["head"] - 1]["posUni"]
                dep = line["fine_dep"]
                direction = "HD" if line["head"] < line["index"] else "DH"
                key = (posHead, dep, posHere)
                keyWithDir = (dep, direction)
                orderTable[keyWithDir] = orderTable.get(keyWithDir, 0) + 1
                keys.add(key)
                distanceCounts[key] = distanceCounts.get(key, 0.0) + 1.0
                distanceSum[key] = distanceSum.get(key, 0.0) + abs(line["index"] - line["head"])
    dhLogits = {}
    for key in keys:
        # orderTable is keyed by (dep, direction), so look up the dep component.
        hd = orderTable.get((key[1], "HD"), 0) + 1.0
        dh = orderTable.get((key[1], "DH"), 0) + 1.0
        dhLogits[key] = log(dh) - log(hd)
    return dhLogits, vocab, keys, depsVocab
crossEntropy = 10.0

def encodeWord(w):
    # Reserve indices 0-2 for special symbols; map OOV words to 1.
    return stoi[w] + 3 if stoi[w] < vocab_size else 1

import torch.nn.functional

counter = 0
while True:
    corpus = CorpusIterator("Ancient_Greek_2.6", "together", args.language).iterator()
    while True:
        try:
            # Draw up to 10 sentences; an exhausted iterator ends the epoch.
            batch = [next(corpus) for _ in range(10)]
        except StopIteration:
            break
        batch = sorted(batch, key=len)
        partitions = list(range(10))
        shuffle(partitions)
        for partition in partitions:
            if counter > 200000:
                print("Quitting at counter " + str(counter))
                quit()
            counter += 1
            printHere = (counter % 50 == 0)
def deepCopy(sentence):
    result = []
    for w in sentence:
        entry = {}
        for key, value in w.items():
            entry[key] = value
        result.append(entry)
    return result

dhWeights_Prior = Normal(
    Variable(torch.FloatTensor([0.0] * len(itos_deps))),
    Variable(torch.FloatTensor([1.0] * len(itos_deps))))
distanceWeights_Prior = Normal(
    Variable(torch.FloatTensor([0.0] * len(itos_deps))),
    Variable(torch.FloatTensor([1.0] * len(itos_deps))))

counter = 0
corpus = CorpusIterator(language, "together", size=size, shuffleDataSeed=50)

def guide(corpus):
    mu_DH = pyro.param(
        "mu_DH",
        Variable(torch.FloatTensor([0.0] * len(itos_deps)), requires_grad=True))
    mu_Dist = pyro.param(
        "mu_Dist",
        Variable(torch.FloatTensor([0.0] * len(itos_deps)), requires_grad=True))
    sigma_DH = pyro.param(
        "sigma_DH",
        Variable(torch.FloatTensor([1.0] * len(itos_deps)),
        Variable(
            torch.FloatTensor([0.0] * len(itos_deps) * len(docs)).view(
                len(docs), len(itos_deps))),
        Variable(
            torch.FloatTensor([1.0] * len(itos_deps) * len(docs)).view(
                len(docs), len(itos_deps))))
distanceWeights_Prior = Normal(
    Variable(
        torch.FloatTensor([0.0] * len(itos_deps) * len(docs)).view(
            len(docs), len(itos_deps))),
    Variable(
        torch.FloatTensor([1.0] * len(itos_deps) * len(docs)).view(
            len(docs), len(itos_deps))))

counter = 0
corpus = CorpusIterator(args.language, "train")

def guide(corpus):
    # Per-document variational parameters, shape (len(docs), len(itos_deps)).
    mu_DH = pyro.param(
        "mu_DH",
        Variable(torch.FloatTensor([0.0] * len(itos_deps) * len(docs)).view(
            len(docs), len(itos_deps)), requires_grad=True))
    mu_Dist = pyro.param(
        "mu_Dist",
        Variable(torch.FloatTensor([0.0] * len(itos_deps) * len(docs)).view(
            len(docs), len(itos_deps)), requires_grad=True))
    sigma_DH = pyro.param(
def makeCoarse(x):
    # Strip subtype suffixes, e.g. "nsubj:pass" -> "nsubj".
    if ":" in x:
        return x[:x.index(":")]
    return x

import hashlib

def hash_(x):
    # hashlib requires bytes in Python 3.
    return hashlib.sha224(x.encode("utf-8")).hexdigest()

hashToSentence = {}
partition = "together"
sentences = list(CorpusIterator(args.language, partition).iterator())

def annotateChildren(sentence):
    for l in sentence:
        l["children"] = []
    for l in sentence:
        if l["head"] != 0:
            sentence[l["head"] - 1]["children"].append(l["index"])

def length(i, sentence):
    # Size of the subtree rooted at token i (1-based), memoized on the token.
    if "length" not in sentence[i - 1]:
        sentence[i - 1]["length"] = 1 + sum(
            [length(x, sentence) for x in sentence[i - 1]["children"]])
    return sentence[i - 1]["length"]
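# Minimal usage sketch for annotateChildren/length; the three-token sentence
# below is hypothetical ("head" is a 1-based token index, 0 marks the root).
_toy = [
    {"index": 1, "word": "the", "head": 2},    # det   -> dog
    {"index": 2, "word": "dog", "head": 3},    # nsubj -> barks
    {"index": 3, "word": "barks", "head": 0},  # root
]
annotateChildren(_toy)
assert length(3, _toy) == 3  # the root's subtree spans all three tokens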
from ud_languages import languages
import sys, os

version = sys.argv[1]

from corpusIterator_V import CorpusIterator_V as CorpusIterator

basePath = ("/u/scr/corpora/Universal_Dependencies/Universal_Dependencies_" +
            version + "/ud-treebanks-v" + version + "/")
files = os.listdir(basePath)
# Directory names look like "UD_Xxx-Treebank"; extract "Xxx" and tag version.
files = sorted(list(set([x[:x.index("-")][3:] + "_2.6" for x in files])))
print(files)
languages = set(languages)
with open("excluded.tex", "w") as outFile:
    for language in files:
        if language not in languages:
            try:
                corpus = sorted(list(CorpusIterator(language, "together").iterator()))
                print(language + " & " + str(len(corpus)) + " & " +
                      str(sum([len(x) for x in corpus])) + "\\\\",
                      file=outFile)
            except AssertionError:
                continue
def deepCopy(sentence):
    result = []
    for w in sentence:
        entry = {}
        for key, value in w.items():
            entry[key] = value
        result.append(entry)
    return result

dhWeights_Prior = Normal(
    Variable(torch.FloatTensor([0.0] * len(itos_deps))),
    Variable(torch.FloatTensor([1.0] * len(itos_deps))))
distanceWeights_Prior = Normal(
    Variable(torch.FloatTensor([0.0] * len(itos_deps))),
    Variable(torch.FloatTensor([1.0] * len(itos_deps))))

counter = 0
corpus = CorpusIterator("Ancient_Greek_2.6", "together", language)

def guide(corpus):
    mu_DH = pyro.param(
        "mu_DH",
        Variable(torch.FloatTensor([0.0] * len(itos_deps)), requires_grad=True))
    mu_Dist = pyro.param(
        "mu_Dist",
        Variable(torch.FloatTensor([0.0] * len(itos_deps)), requires_grad=True))
    sigma_DH = pyro.param(
        "sigma_DH",
        Variable(torch.FloatTensor([1.0] * len(itos_deps)),
batchSize = 1
lr_lm = 0.1
crossEntropy = 10.0

def encodeWord(w):
    return stoi[w] + 3 if stoi[w] < vocab_size else 1

import torch.nn.functional

counter = 0
while True:
    corpus = CorpusIterator(args.language).iterator(rejectShortSentences=True)
    sentsForThisRound = 0
    for sentence, metadata in corpus:
        try:
            # sent_id encodes the year as its first dot-separated field.
            year = int(metadata["sent_id"].split(".")[0])
        except ValueError:
            assert False, metadata
        if year < args.start or year >= args.end:
            continue
        sentsForThisRound += 1
        if sentsForThisRound % 100 == 0:
            print(sentsForThisRound, "sentences")
        if counter > 200000:
            print("Quitting at counter " + str(counter))
            quit()
        counter += 1
lr_lm = 0.1
crossEntropy = 10.0

def encodeWord(w):
    return stoi[w] + 3 if stoi[w] < vocab_size else 1

import torch.nn.functional

counter = 0
while True:
    corpus = CorpusIterator(args.language, shuffleDataSeed=myID,
                            size=args.size).iterator(rejectShortSentences=True)
    while True:
        try:
            batch = [next(corpus) for _ in range(10)]
        except StopIteration:
            break
        batch = sorted(batch, key=len)
        partitions = list(range(10))
        shuffle(partitions)
        for partition in partitions:
            if counter > 200000:
                print("Quitting at counter " + str(counter))
                quit()
            counter += 1
    for line in np + ["EOS"]:
        if line == "EOS":
            yield ("EOS", MAX_DIST, "EOS", "EOS")
        else:
            # Reset the clock on adjectival modifiers and determiners;
            # otherwise count tokens since the last relevant one.
            if line["dep"] in ["amod", "det"]:
                timeSinceRelevant = 0
            else:
                timeSinceRelevant += 1
            yield (line["word"], min(MAX_DIST, timeSinceRelevant),
                   line["posUni"], line["coarse_dep"])
    for _ in range(args.cutoff + 2):
        yield ("PAD", MAX_DIST, "PAD", "PAD")
    yield ("SOS", MAX_DIST, "SOS", "PAD")

# Note: the dev stream is built from the train partition here.
corpusDev = CorpusIterator(
    args.language, "train",
    storeMorph=True).iterator(rejectShortSentences=False)
dev = list(createStreamContinuous(corpusDev))[::-1]
#corpusTrain = CorpusIterator(args.language, "train", storeMorph=True).iterator(rejectShortSentences=False)
#train = list(createStreamContinuous(corpusTrain))[::-1]
train = dev

idev = range(len(dev))
itrain = range(len(train))
devW = [x[0] for x in dev]
trainW = [x[0] for x in train]
# Sort positions by the 20-word window starting there (suffix-array style).
idev = sorted(idev, key=lambda i: devW[i:i + 20])
itrain = sorted(itrain, key=lambda i: trainW[i:i + 20])
def deepCopy(sentence):
    result = []
    for w in sentence:
        entry = {}
        for key, value in w.items():
            entry[key] = value
        result.append(entry)
    return result

dhWeights_Prior = Normal(
    Variable(torch.FloatTensor([0.0] * len(itos_deps))),
    Variable(torch.FloatTensor([1.0] * len(itos_deps))))
distanceWeights_Prior = Normal(
    Variable(torch.FloatTensor([0.0] * len(itos_deps))),
    Variable(torch.FloatTensor([1.0] * len(itos_deps))))

counter = 0
corpus = CorpusIterator(language, "together", Partition=Partition)

def guide(corpus):
    mu_DH = pyro.param(
        "mu_DH",
        Variable(torch.FloatTensor([0.0] * len(itos_deps)), requires_grad=True))
    mu_Dist = pyro.param(
        "mu_Dist",
        Variable(torch.FloatTensor([0.0] * len(itos_deps)), requires_grad=True))
    sigma_DH = pyro.param(
        "sigma_DH",
        Variable(torch.FloatTensor([1.0] * len(itos_deps)),
    x[1]: int(x[2])
    for x in [y.split("\t") for y in perseusAuthors]
}
print(perseusAuthors)
with open("perseus-docs.txt", "r") as inFile:
    perseusDocs = inFile.read().strip().split("\n")
fromDocToYear = {}
# perseus-docs.txt lists records in groups of three lines: author, path, blank.
for i in range(0, len(perseusDocs), 3):
    author = perseusDocs[i].replace("=", "").strip()
    filename = perseusDocs[i + 1]
    filename = filename[filename.rfind("/") + 1:]
    print(author, filename, perseusAuthors[author])
    fromDocToYear[filename] = perseusAuthors[author]

from collections import defaultdict

data = list(CorpusIterator("Ancient_Greek_2.6", "together").iterator())
byYear = defaultdict(int)
for sent in data:
    metadata = sent[1]
    if "source" in metadata:
        year = 100 if "New Test" in metadata["source"] else (
            -450 if "Histories" in metadata["source"] else "NA")
        byYear[year] += len(sent[0])
    elif "sent_id" in metadata:
        text_id = metadata["sent_id"]
        text_id = text_id[:text_id.index("@")]
        byYear[fromDocToYear[text_id]] += len(sent[0])
# Stringify keys so numeric years and the "NA" bucket stay comparable in Python 3.
print(sorted(list(byYear.items()), key=lambda kv: str(kv[0])))
def makeCoarse(x):
    if ":" in x:
        return x[:x.index(":")]
    return x

import hashlib

def hash_(x):
    # hashlib requires bytes in Python 3.
    return hashlib.sha224(x.encode("utf-8")).hexdigest()

hashToSentence = {}
for partition in ["together"]:
    for sentence in CorpusIterator(args.language, partition).iterator():
        sentenceHash = hash_(" ".join([x["word"] for x in sentence]))
        hashToSentence[sentenceHash] = sentence

TARGET_DIR = "/u/scr/mhahn/deps/DLM_MEMORY_OPTIMIZED/locality_optimized_dlm/manual_output_funchead_fine_depl_perSent/"

import glob
from collections import defaultdict

orderBySentence = {x: [] for x in hashToSentence}
files = glob.glob(TARGET_DIR + "/" + args.language + "*.tsv")
for path in files:
    print(path)
    with open(path, "r") as inFile:
        header = next(inFile).strip().split("\t")
        header = dict(list(zip(header, range(len(header)))))
        # Expected columns: DH_Weight, CoarseDependency, HeadPOS,
        # DependentPOS, DistanceWeight, Language, FileName.
        objDir = None
import time
import torch.nn.functional

counter = 0

from collections import defaultdict

totalVocabCount = sum(y for x, y in vocab.items())
# Unigram surprisal under the empirical distribution, used as a baseline.
surprisalBaseline = {
    x: -log(y / float(totalVocabCount))
    for x, y in vocab.items()
}
while True:
    corpus = list(CorpusIterator(args.language).iterator(rejectShortSentences=False))
    shuffle(corpus)
    refreshInterval = int(0.05 * len(corpus))
    refreshInterval = min(1000, max(100, refreshInterval))
    for sentenceID, sentence in enumerate(corpus):
        if counter > 200000:
            print("Quitting at counter " + str(counter))
            quit()
        counter += 1
        printHere = (counter % 50 == 0)
        a = time.time()
        if sentenceID % refreshInterval == 0:
            dhWeightsList = [float(x) for x in dhWeights]
            distanceWeightsList = [float(x) for x in distanceWeights]
            print("Start linearizing")
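# For intuition (invented counts): the unigram baseline assigns each word the
# surprisal -log p(word) under the empirical distribution, e.g. a word with
# count 10 out of 100 tokens gets -log(0.1) ~= 2.3 nats.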
dhWeights_Prior = Normal(
    Variable(torch.FloatTensor([0.0] * len(itos_deps))),
    Variable(torch.FloatTensor([1.0] * len(itos_deps))))
distanceWeights_Prior = Normal(
    Variable(torch.FloatTensor([0.0] * len(itos_deps))),
    Variable(torch.FloatTensor([1.0] * len(itos_deps))))

def inRange(x):
    return args.start <= x < args.end

counter = 0
# Keep only sentences whose year (encoded in sent_id) falls in [start, end).
corpus = [
    x for x, metadata in list(CorpusIterator(args.language, "together").iterator())
    if inRange(int(metadata["sent_id"].split(".")[0]))
]
print(len(corpus), "SENTENCES")

def guide(corpus):
    mu_DH = pyro.param(
        "mu_DH",
        Variable(torch.FloatTensor([0.0] * len(itos_deps)), requires_grad=True))
    mu_Dist = pyro.param(
        "mu_Dist",
        Variable(torch.FloatTensor([0.0] * len(itos_deps)), requires_grad=True))
quit()

dhWeights = Variable(torch.FloatTensor([0.0] * len(itos_pure_deps)),
                     requires_grad=True)
distanceWeights = Variable(torch.FloatTensor([0.0] * len(itos_pure_deps)),
                           requires_grad=True)
for i, key in enumerate(itos_pure_deps):
    dhLogits[key] = 0.0
    if key == "obj":
        dhLogits[key] = (10.0 if posCount < negCount else -10.0)
    dhWeights.data[i] = dhLogits[key]
    originalDistanceWeights[key] = 0.0  # alternative: random()
    distanceWeights.data[i] = originalDistanceWeights[key]

data_train = list(CorpusIterator(args.language, "train",
                                 storeMorph=True).iterator(rejectShortSentences=False))
data_dev = list(CorpusIterator(args.language, "dev",
                               storeMorph=True).iterator(rejectShortSentences=False))

words = []
affixFrequency = {}

print(itos_pure_deps)
itos_pure_deps = sorted(list(itos_pure_deps) + ["HEAD"])
stoi_pure_deps = dict(list(zip(itos_pure_deps, range(len(itos_pure_deps)))))
itos_pure_deps_ = itos_pure_deps[::]
shuffle(itos_pure_deps_)
# Assign each relation (and the HEAD slot) an even integer as an abstract slot.
weights = dict(list(zip(itos_pure_deps_,
                        [2 * x for x in range(len(itos_pure_deps_))])))

def calculateTradeoffForWeights(weights):
lr_lm = 0.1
crossEntropy = 10.0

def encodeWord(w):
    return stoi[w] + 3 if stoi[w] < vocab_size else 1

import torch.nn.functional

counter = 0
while True:
    corpus = CorpusIterator(
        args.language,
        Partition=args.Partition).iterator(rejectShortSentences=True)
    while True:
        try:
            batch = [next(corpus) for _ in range(10)]
        except StopIteration:
            break
        batch = sorted(batch, key=len)
        partitions = list(range(10))
        shuffle(partitions)
        for partition in partitions:
            if counter > 200000:
                print("Quitting at counter " + str(counter))
                quit()
            counter += 1
    lr=args.lr_grammar, momentum=args.momentum_grammar)
optim_amortized = torch.optim.SGD(parameters_amortized(),
                                  lr=args.lr_amortized,
                                  momentum=args.momentum_amortized)

import torch.nn.functional

counter = 0
dependencyLengthsLast = 1000
dependencyLengths = [1000]
dependencyLengthsPerEpoch = []
for epoch in range(50):
    corpus = list(
        CorpusIterator(args.language,
                       partition="together").iterator(rejectShortSentences=True))
    shuffle(corpus)
    # Record the mean dependency length of the previous epoch.
    dependencyLengthsPerEpoch.append(
        sum(dependencyLengths) / (0.0 + len(dependencyLengths)))
    dependencyLengths = []
    for sentence in corpus:
        if counter > 200000:
            print("Quitting at counter " + str(counter))
            quit()
        counter += 1
        printHere = (counter % 200 == 0)
        current = [sentence]
        assert len(current) == 1
        depLength, overallLogprobSum = orderSentence(current[0], dhLogits, printHere)
logsoftmax = torch.nn.LogSoftmax()

def deepCopy(sentence):
    result = []
    for w in sentence:
        entry = {}
        for key, value in w.items():
            entry[key] = value
        result.append(entry)
    return result

# Standard-normal priors over the direction and distance parameters.
dhWeights_Prior = Normal(
    Variable(torch.FloatTensor([0.0] * len(itos_deps))),
    Variable(torch.FloatTensor([1.0] * len(itos_deps))))
distanceWeights_Prior = Normal(
    Variable(torch.FloatTensor([0.0] * len(itos_deps))),
    Variable(torch.FloatTensor([1.0] * len(itos_deps))))

counter = 0
corpus = CorpusIterator(language, "together")

def guide(corpus):
    # Mean-field Gaussian variational posterior over both weight vectors.
    mu_DH = pyro.param("mu_DH",
                       Variable(torch.FloatTensor([0.0] * len(itos_deps)),
                                requires_grad=True))
    mu_Dist = pyro.param("mu_Dist",
                         Variable(torch.FloatTensor([0.0] * len(itos_deps)),
                                  requires_grad=True))
    sigma_DH = pyro.param("sigma_DH",
                          Variable(torch.FloatTensor([1.0] * len(itos_deps)),
                                   requires_grad=True))
    sigma_Dist = pyro.param("sigma_Dist",
                            Variable(torch.FloatTensor([1.0] * len(itos_deps)),
                                     requires_grad=True))
    dhWeights = pyro.sample("dhWeights", dist.Normal(mu_DH, sigma_DH))
    distanceWeights = pyro.sample("distanceWeights", dist.Normal(mu_Dist, sigma_Dist))

def model(corpus):
    global counter
    dhWeights = pyro.sample("dhWeights", dhWeights_Prior)
    distanceWeights = pyro.sample("distanceWeights", distanceWeights_Prior)
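# Sketch: how this model/guide pair would typically be trained with Pyro's
# SVI. The optimizer settings and step count below are assumptions for
# illustration, not values taken from this script.
#
# from pyro.infer import SVI, Trace_ELBO
# from pyro.optim import Adam
#
# svi = SVI(model, guide, Adam({"lr": 0.001}), loss=Trace_ELBO())
# for step in range(1000):
#     loss = svi.step(corpus)  # svi.step passes `corpus` to model and guide
#     if step % 100 == 0:
#         print(step, loss)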