def computeDevLoss(order="mixed"):
    devBatchSize = 512
    global printHere
    global horizon
    devLoss = 0.0
    devWords = 0
    corpusDev = CorpusIterator(language, "dev", storeMorph=True).iterator(rejectShortSentences=False)
    stream = createStreamContinuous(corpusDev, order=order)
    surprisalTable = [0 for _ in range(horizon)]
    devCounter = 0
    devCounterTimesBatchSize = 0
    while True:
        # Collect up to devBatchSize examples; the final batch may be shorter.
        input_indices_list = []
        wordStartIndices_list = []
        try:
            for _ in range(devBatchSize):
                input_indices, wordStartIndices = next(stream)
                input_indices_list.append(input_indices)
                wordStartIndices_list.append(wordStartIndices)
        except StopIteration:
            devBatchSize = len(input_indices_list)
        if devBatchSize == 0:
            break
        devCounter += 1
        printHere = (devCounter % 100 == 0)
        _, _, _, newLoss, newWords = doForwardPass(input_indices_list, wordStartIndices_list,
                                                   surprisalTable=surprisalTable, doDropout=False,
                                                   batchSizeHere=devBatchSize)
        devLoss += newLoss
        devWords += newWords
        if printHere:
            print "Dev examples " + str(devCounter)
        devCounterTimesBatchSize += devBatchSize
    devSurprisalTableHere = [surp / devCounterTimesBatchSize for surp in surprisalTable]
    return devLoss / devWords, devSurprisalTableHere
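# The loop above uses a common pattern: pull up to devBatchSize items from a
# generator and shrink the final batch when the stream runs out. A minimal
# self-contained sketch of the same pattern (names here are illustrative,
# not from the original file):
def batched(stream, batch_size):
    while True:
        batch = []
        try:
            for _ in range(batch_size):
                batch.append(next(stream))
        except StopIteration:
            pass  # stream exhausted; fall through with a partial batch
        if len(batch) == 0:
            break
        yield batch

# Example: list(batched(iter(range(5)), 2)) == [[0, 1], [2, 3], [4]]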
def initializeOrderTable():
    orderTable = {}
    keys = set()
    vocab = {}
    distanceSum = {}
    distanceCounts = {}
    depsVocab = set()
    for partition in ["train", "dev"]:
        for sentence in CorpusIterator(language, partition).iterator():
            for line in sentence:
                vocab[line["word"]] = vocab.get(line["word"], 0) + 1
                line["coarse_dep"] = makeCoarse(line["dep"])
                depsVocab.add(line["coarse_dep"])
                posFine.add(line["posFine"])
                posUni.add(line["posUni"])
                if line["coarse_dep"] == "root":
                    continue
                posHere = line["posUni"]
                posHead = sentence[line["head"] - 1]["posUni"]
                dep = line["coarse_dep"]
                direction = "HD" if line["head"] < line["index"] else "DH"
                infostruc = line["infostruc"]
                key = (dep, infostruc)
                keyWithDir = (dep, direction)
                orderTable[keyWithDir] = orderTable.get(keyWithDir, 0) + 1
                keys.add(key)
                distanceCounts[key] = distanceCounts.get(key, 0.0) + 1.0
                distanceSum[key] = distanceSum.get(key, 0.0) + abs(line["index"] - line["head"])
    dhLogits = {}
    for key in keys:
        # orderTable is keyed by (dep, direction), so look up via the
        # dependency type key[0], not the full (dep, infostruc) key.
        hd = orderTable.get((key[0], "HD"), 0) + 1.0
        dh = orderTable.get((key[0], "DH"), 0) + 1.0
        dhLogit = log(dh) - log(hd)
        dhLogits[key] = dhLogit
        originalDistanceWeights[key] = (distanceSum[key] / distanceCounts[key])
    return dhLogits, vocab, keys, depsVocab
def initializeOrderTable():
    orderTable = {}
    keys = set()
    vocab = {}
    distanceSum = {}
    distanceCounts = {}
    depsVocab = set()
    for partition in ["train", "dev"]:
        for sentence in CorpusIterator(args.language, partition, storeMorph=True).iterator():
            for line in sentence:
                vocab[line["word"]] = vocab.get(line["word"], 0) + 1
                vocab_lemmas[line["lemma"]] = vocab_lemmas.get(line["lemma"], 0) + 1
                depsVocab.add(line["dep"])
                posFine.add(line["posFine"])
                posUni.add(line["posUni"])
                for morph in line["morph"]:
                    morphKeyValuePairs.add(morph)
                if line["dep"] == "root":
                    continue
                posHere = line["posUni"]
                posHead = sentence[line["head"] - 1]["posUni"]
                dep = line["dep"]
                direction = "HD" if line["head"] < line["index"] else "DH"
                key = (posHead, dep, posHere)
                keyWithDir = (posHead, dep, posHere, direction)
                orderTable[keyWithDir] = orderTable.get(keyWithDir, 0) + 1
                keys.add(key)
                distanceCounts[key] = distanceCounts.get(key, 0.0) + 1.0
                distanceSum[key] = distanceSum.get(key, 0.0) + abs(line["index"] - line["head"])
    dhLogits = {}
    for key in keys:
        hd = orderTable.get((key[0], key[1], key[2], "HD"), 0) + 1.0
        dh = orderTable.get((key[0], key[1], key[2], "DH"), 0) + 1.0
        dhLogit = log(dh) - log(hd)
        dhLogits[key] = dhLogit
        originalDistanceWeights[key] = (distanceSum[key] / distanceCounts[key])
    return dhLogits, vocab, keys, depsVocab
def initializeOrderTable():
    orderTable = {}
    keys = set()
    vocab = {}
    distanceSum = {}
    distanceCounts = {}
    depsVocab = set()
    for partition in ["train", "dev"]:
        for sentence in CorpusIterator(args.language, partition).iterator():
            sentHash = hashlib.sha224(" ".join([x["word"] for x in sentence])).hexdigest()
            for sentNum, line in enumerate(sentence):
                vocab[line["word"]] = vocab.get(line["word"], 0) + 1
                line["coarse_dep"] = makeCoarse(line["dep"])
                depsVocab.add(line["coarse_dep"])
                posFine.add(line["posFine"])
                posUni.add(line["posUni"])
                if line["coarse_dep"] == "root":
                    continue
                posHere = line["posUni"]
                posHead = sentence[line["head"] - 1]["posUni"]
                dep = line["coarse_dep"]
                direction = "HD" if line["head"] < line["index"] else "DH"
                # One key per (sentence, token position): every dependency
                # instance is treated as its own type in this variant.
                key = sentHash + "_" + str(sentNum)
                keyWithDir = (dep, direction)
                orderTable[keyWithDir] = orderTable.get(keyWithDir, 0) + 1
                keys.add(key)
                distanceCounts[key] = distanceCounts.get(key, 0.0) + 1.0
                distanceSum[key] = distanceSum.get(key, 0.0) + abs(line["index"] - line["head"])
    dhLogits = {}
    for key in keys:
        # Per-token keys never match the (dep, direction) entries in orderTable,
        # so hd == dh == 1.0 and every logit is 0; the per-token distance
        # averages are what this variant actually uses.
        hd = orderTable.get((key, "HD"), 0) + 1.0
        dh = orderTable.get((key, "DH"), 0) + 1.0
        dhLogit = log(dh) - log(hd)
        dhLogits[key] = dhLogit
        originalDistanceWeights[key] = (distanceSum[key] / distanceCounts[key])
    return dhLogits, vocab, keys, depsVocab
def computeDevLoss():
    devBatchSize = 32
    global printHere
    devLoss = 0.0
    devWords = 0
    corpusDev = CorpusIterator(args.language, "dev").iterator(rejectShortSentences=False)
    stream = createStream(corpusDev, training=False)
    surprisalTable = [0 for _ in range(2)]
    devCounter = 0
    devCounterTimesBatchSize = 0
    while True:
        input_indices_list = []
        wordStartIndices_list = []
        try:
            for _ in range(devBatchSize):
                input_indices, wordStartIndices, _ = next(stream)
                input_indices_list.append(input_indices)
                wordStartIndices_list.append(wordStartIndices)
        except StopIteration:
            devBatchSize = len(input_indices_list)
        if devBatchSize == 0:
            break
        devCounter += 1
        printHere = (devCounter % 100 == 0)
        with torch.no_grad():
            _, _, _, newLoss, newWords = doForwardPass(
                input_indices_list, wordStartIndices_list,
                surprisalTable=surprisalTable, doDropout=False,
                batchSizeHere=devBatchSize, relevant_logprob_sum=None)
        devLoss += newLoss
        devWords += newWords
        if printHere:
            print "Dev examples " + str(devCounter)
        devCounterTimesBatchSize += devBatchSize
    return devLoss / devWords, None  # devSurprisalTableHere is not computed in this variant
def computeDevLoss():
    global printHere
    counterDev = 0
    corpusDev = CorpusIterator(language, "dev").iterator(rejectShortSentences=True)
    partitionsDev = getPartitions(corpusDev)
    devLossU = 0
    devLossL = 0
    devAccuracy = 0
    devAccuracyLabeled = 0
    devWords = 0
    for partitionDev in partitionsDev:
        counterDev += 1
        printHere = (counterDev % 500 == 0)
        lossU, lossL, _, accuracy, accuracyLabeled, wordNum = forward(
            partitionDev, computeAccuracy=True, doDropout=False)
        devLossU += lossU.data.cpu().numpy()
        devLossL += lossL.data.cpu().numpy()
        devAccuracy += accuracy
        devAccuracyLabeled += accuracyLabeled
        devWords += wordNum
        if counterDev % 50 == 0:
            print "Run on dev " + str(counterDev)
            print(devLossU / devWords, devLossL / devWords,
                  float(devAccuracy) / devWords,
                  float(devAccuracyLabeled) / devWords, devWords)
    newDevLossL = devLossL / devWords
    newDevLossU = devLossU / devWords
    newDevAccuracy = float(devAccuracy) / devWords
    newDevAccuracyLabeled = float(devAccuracyLabeled) / devWords
    devLossesL.append(newDevLossL)
    devLossesU.append(newDevLossU)
    devAccuracies.append(newDevAccuracy)
    devAccuraciesLabeled.append(newDevAccuracyLabeled)
def initializeOrderTable():
    orderTable = {}
    keys = set()
    vocab = {}
    distanceSum = {}
    distanceCounts = {}
    depsVocab = set()
    for partition in ["train", "dev"]:
        for sentence in CorpusIterator(args.language, partition).iterator():
            for line in sentence:
                vocab[line["word"]] = vocab.get(line["word"], 0) + 1
                line["coarse_dep"] = makeCoarse(line["dep"])
                depsVocab.add(line["coarse_dep"])
                posFine.add(line["posFine"])
                posUni.add(line["posUni"])
                if line["coarse_dep"] == "root":
                    continue
                posHere = line["posUni"]
                posHead = sentence[line["head"] - 1]["posUni"]
                dep = line["coarse_dep"]
                direction = "HD" if line["head"] < line["index"] else "DH"
                key = dep
                keyWithDir = (dep, direction)
                orderTable[keyWithDir] = orderTable.get(keyWithDir, 0) + 1
                keys.add(key)
                distanceCounts[key] = distanceCounts.get(key, 0.0) + 1.0
                distanceSum[key] = distanceSum.get(key, 0.0) + abs(line["index"] - line["head"])
    dhLogits = {}
    for key in keys:
        # Add-one smoothed log-odds of dependent-head vs. head-dependent order.
        hd = orderTable.get((key, "HD"), 0) + 1.0
        dh = orderTable.get((key, "DH"), 0) + 1.0
        dhLogit = log(dh) - log(hd)
        dhLogits[key] = dhLogit
    return dhLogits, vocab, keys, depsVocab
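# Toy illustration (not from the original script) of the smoothed direction
# logit computed above: a dependency type seen 30 times head-first (HD) and
# 10 times dependent-first (DH) gets, with add-one smoothing,
# log(10 + 1) - log(30 + 1) ~= -1.04, i.e. a preference for head-first order.
from math import log

toyOrderTable = {("amod", "HD"): 30, ("amod", "DH"): 10}
hd = toyOrderTable.get(("amod", "HD"), 0) + 1.0
dh = toyOrderTable.get(("amod", "DH"), 0) + 1.0
print(log(dh) - log(hd))  # ~ -1.04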
        counter += 1
        printHere = (counter % 1000 == 0)
        current = batch[partition * batchSize:(partition + 1) * batchSize]
        _, _, _, newLoss, newWords, lossWords, lossPOS = doForwardPassEvaluate(current, train=False)
        devLoss += newLoss
        devWords += newWords
        devLossWords += lossWords
        devLossPOS += lossPOS
    return devLoss / devWords, devLossWords / devWords, devLossPOS / devWords


if True:
    corpus = CorpusIterator(args.language).iterator()
    while True:
        try:
            batch = [next(corpus)]
        except StopIteration:
            break
        partitions = range(1)
        for partition in partitions:
            counter += 1
            printHere = (counter % 1000 == 0)
            current = batch[partition * batchSize:(partition + 1) * batchSize]
            doForwardPassTrain(current)
            ])
            wordNum += 1
    if wordNum > 0:
        # Exponential moving average of per-word dependency length.
        crossEntropy = 0.99 * crossEntropy + 0.01 * (totalDepLength / wordNum)
    else:
        assert totalDepLength == 0
    numberOfWords = wordNum
    return (totalDepLength, numberOfWords, byType)


assert batchSize == 1

depLengths = []
if True:
    corpus = CorpusIterator(args.language, "train")
    corpusIterator = corpus.iterator()
    if corpus.length() == 0:
        quit()
    while True:
        try:
            batch = [next(corpusIterator)]
        except StopIteration:
            break
        partitions = range(1)
        for partition in partitions:
            counter += 1
            printHere = (counter % 200 == 0)
            current = batch[partition * batchSize:(partition + 1) * batchSize]
            if len(current) == 0:
"index", "word", "lemma", "posUni", "posFine", "morph", "head", "dep", "_", "_" ] originalDistanceWeights = {} orderTable = {} keys = set() vocab = {} distanceSum = {} distanceCounts = {} depsVocab = set() totalCount = 0 for partition in ["train", "dev"]: for sentence in CorpusIterator(language, partition, storeMorph=True).iterator(): for line in sentence: vocab[line["word"]] = vocab.get(line["word"], 0) + 1 depsVocab.add(line["dep"]) posFine.add(line["posFine"]) posUni.add(line["posUni"]) if line["dep"] == "root": continue posHere = line["posUni"] posHead = sentence[line["head"] - 1]["posUni"] dep = line["dep"] direction = "HD" if line["head"] < line["index"] else "DH" key = (posHead, dep, posHere) keyWithDir = (posHead, dep, posHere, direction)
                                                   batchSizeHere=devBatchSize)
        devLoss += newLoss
        devWords += newWords
        if printHere:
            print "Dev examples " + str(devCounter)
        devCounterTimesBatchSize += devBatchSize
    devSurprisalTableHere = [surp / devCounterTimesBatchSize for surp in surprisalTable]
    return devLoss / devWords, devSurprisalTableHere


DEV_PERIOD = 5000
epochCount = 0
corpusBase = CorpusIterator(language, storeMorph=True, trainSize=trainingSize, devSize=devSize)
while failedDevRuns == 0:
    epochCount += 1
    print "Starting new epoch, permuting corpus"
    corpus = corpusBase.iterator(rejectShortSentences=False)
    stream = createStream(corpus)
    if counter > 5:
        newDevLoss, devSurprisalTableHere = computeDevLoss()
        devLosses.append(newDevLoss)
        print "New dev loss " + str(newDevLoss) + ". previous was: " + str(lastDevLoss)
        if newDevLoss > 15 or len(devLosses) > 99:
            print "Abort, training too slow?"
            devLosses.append(newDevLoss + 0.001)
def genderTest(mode):
    training = CorpusIterator("German", partition="train", storeMorph=True, removePunctuation=True)
    # Collect singular nominative noun lemmas by grammatical gender.
    genders = dict([("Gender=" + x, set()) for x in ["Masc", "Fem", "Neut"]])
    for sentence in training.iterator():
        for line in sentence:
            if line["posUni"] == "NOUN" and "|" not in line["lemma"]:
                morph = line["morph"]
                if "Number=Sing" in morph and "Case=Nom" in morph:
                    gender = [x for x in morph if x.startswith("Gender=")]
                    if len(gender) > 0:
                        genders[gender[0]].add(line["lemma"].lower())
    for gender in genders:
        print("OOV Ratio for ", gender,
              sum([0 if x in stoi else 1 for x in genders[gender]]) / len(genders[gender]))
    counter = 0
    results = [[0, 0, 0] for _ in range(3)]
    for genderIndex, gender in enumerate(["Gender=" + x for x in ["Masc", "Fem", "Neut"]]):
        with open(f"stimuli/german-gender-{gender}-{mode}-noOOVs.txt", "w") as outFile:
            counter = 0
            for noun in genders[gender]:
                if noun not in stoi:
                    continue
                counter += 1
                # Sample an in-vocabulary adjective, inflected with -e.
                chosenAdjective = "_NONE_"
                while chosenAdjective not in stoi:
                    chosenAdjective = random.choice(adjectives) + "e"
                if mode == "nothing":
                    nounStimulus = [noun]
                elif mode == "adjective":
                    nounStimulus = [chosenAdjective, noun]
                elif mode == "sehr + adjective":
                    nounStimulus = ["sehr", chosenAdjective, noun]
                elif mode == "sehr + extrem + adjective":
                    nounStimulus = ["sehr", "extrem", chosenAdjective, noun]
                print(" ".join(["der"] + nounStimulus), file=outFile)
                print(" ".join(["die"] + nounStimulus), file=outFile)
                print(" ".join(["das"] + nounStimulus), file=outFile)
                results[genderIndex][doChoiceList(
                    [". der " + " ".join(nounStimulus) + " .",
                     ". die " + " ".join(nounStimulus) + " .",
                     ". das " + " ".join(nounStimulus) + " ."],
                    printHere=(random.random() > 0.98))] += 1
                if random.random() > 0.98:
                    print([[round(x / (counter if genderIndex == i else 1), 2)
                            for x in results[i]] for i in range(len(results))])
        results[genderIndex] = [x / counter for x in results[genderIndex]]
    return results
if args.model == "REAL_REAL": WORD2 = WORD elif args.model == "EVEN_ODD": WORDA = WORD[::2] WORDB = WORD[1::2] WORD2 = WORDA + WORDB assert len(WORD2) == len(WORD) elif args.model == "SORTED": # not invertible WORD2 = "".join(sorted(list(WORD))) for x in WORD2: yield x for _ in range(args.cutoff + 2): yield "EOW" corpusDev = CorpusIterator( args.language, "dev", storeMorph=True).iterator(rejectShortSentences=False) words = [] with open("/u/scr/corpora/ldc/1996/LDC96L14/english/epl/epl.cd", "r") as inFile: for line in inFile: line = line.strip().split("\\") orth = line[1] syll = line[5].replace("'", "").split("-") print(orth, syll) #, line) if args.model == "REAL_REAL": WORD2 = "".join(syll) elif args.model == "EVEN_ODD": syllA = syll[::2] syllB = syll[1::2]
parser.add_argument("--hidden_dim", type=int, default=1024) parser.add_argument("--layer_num", type=int, default=1) parser.add_argument("--weight_dropout_in", type=float, default=0.01) parser.add_argument("--weight_dropout_hidden", type=float, default=0.1) parser.add_argument("--char_dropout_prob", type=float, default=0.33) parser.add_argument("--char_noise_prob", type=float, default=0.01) parser.add_argument("--learning_rate", type=float, default=0.1) parser.add_argument("--myID", type=int, default=random.randint(0, 1000000000)) parser.add_argument("--sequence_length", type=int, default=50) args = parser.parse_args() print(args) from corpusIterator import CorpusIterator training = CorpusIterator(args.language, partition="train", storeMorph=False, removePunctuation=True) dev = CorpusIterator(args.language, partition="dev", storeMorph=False, removePunctuation=True) def plus(it1, it2): for x in it1: yield x for x in it2: yield x try:
    def __init__(self, language, partition="train", storeMorph=False, splitLemmas=False):
        self.basis = CorpusIterator(language, partition=partition,
                                    storeMorph=storeMorph, splitLemmas=splitLemmas)
assert args.beta <= 1.0

import random
import sys

header = [
    "index", "word", "lemma", "posUni", "posFine", "morph", "head", "dep",
    "_", "_"
]

from corpusIterator import CorpusIterator

# Count POS n-grams of length 2*horizon over the training set, with EOS
# padding at sentence boundaries.
ngrams = {}
lastPosUni = ("EOS", ) * (2 * args.horizon - 1)
for sentence in CorpusIterator(args.language, "train", storeMorph=True).iterator():
    for line in sentence:
        nextPosUni = line["posUni"]
        ngram = lastPosUni + (nextPosUni, )
        ngrams[ngram] = ngrams.get(ngram, 0) + 1
        lastPosUni = lastPosUni[1:] + (nextPosUni, )
    nextPosUni = "EOS"
    ngram = lastPosUni + (nextPosUni, )
    ngrams[ngram] = ngrams.get(ngram, 0) + 1
    lastPosUni = lastPosUni[1:] + (nextPosUni, )

import torch.nn as nn
import torch
from torch.autograd import Variable
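# Hedged sketch (an assumption, not from this file): if args.beta is an
# additive-smoothing constant, the raw counts collected above would turn into
# conditional probabilities P(next POS | context) roughly like this.
def conditional_prob(ngrams, context, nextPos, beta, vocabSize):
    joint = ngrams.get(context + (nextPos, ), 0) + beta
    total = sum(count for ngram, count in ngrams.items()
                if ngram[:-1] == context) + beta * vocabSize
    return joint / total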
# ./python27 corpusSizes.py > corpusSizes.tsv
# Result extracted from tex file:
# c(1315,974,21864,514,926,5396,788,8907,808,550,13123,3997,7689,102993,4383,18310,17062,1450,6959,1108,27198,32347,13814,1662,5241,13304,910,4477,17427,7164,947,27410,634,4124,1123,848,2257,29870,4798,6100,17995,8664,52664,2935,8483,7532,28492,7041,900,3685,4506,4043,1656,1400)

from corpusIterator import CorpusIterator
from ud_languages import languages

with open("../corpusSizes.tsv", "w") as outFile:
    print >> outFile, ("Language\tTrainingSents\tHeldoutSents")
    for language in languages:
        train = [x for x in CorpusIterator(language, "train", storeMorph=False).iterator()]
        heldout = [x for x in CorpusIterator(language, "dev", storeMorph=False).iterator()]
        print >> outFile, (language + "\t" + str(len(train)) + "\t" + str(len(heldout)))
crossEntropy = 10.0
counter = 0
lastDevLoss = None
failedDevRuns = 0
devLosses = []

corpusTrain = CorpusIterator(args.language, "train", storeMorph=True).iterator(rejectShortSentences=False)
count = 0
for s in corpusTrain:
    count += 1
    if count % 100 == 0:
        print(count)
    orderSentence(s, dhLogits, False)


def toCounts(l):
    c = {}
    for x in l:
        c[x] = c.get(x, 0) + 1.0
    return c
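# Quick illustration of toCounts (not in the original file):
# toCounts(["a", "b", "a"]) == {"a": 2.0, "b": 1.0}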
def __init__(self, language, partition="train", fraction=1.0, storeMorph=False, splitLemmas=False): self.basis = CorpusIterator(language, partition=partition, storeMorph=storeMorph, splitLemmas=splitLemmas, shuffleDataSeed=4) self.basis.data = self.basis.data[:int(fraction*len(self.basis.data))] self.permute() self.fraction = fraction
        counter += 1
        printHere = (counter % 50 == 0)
        current = batch[partition * batchSize:(partition + 1) * batchSize]
        _, _, _, newLoss, newWords, lossWords, lossPOS = doForwardPass(current, train=False)
        devLoss += newLoss
        devWords += newWords
        devLossWords += lossWords
        devLossPOS += lossPOS
    return devLoss / devWords, devLossWords / devWords, devLossPOS / devWords


while True:
    corpus = CorpusIterator(language)
    corpus.permute()
    corpus = corpus.iterator(rejectShortSentences=True)
    while True:
        try:
            batch = map(lambda x: next(corpus), 10 * range(batchSize))
        except StopIteration:
            break
        batch = sorted(batch, key=len)
        partitions = range(10)
        shuffle(partitions)
        for partition in partitions:
            counter += 1
            printHere = (counter % 100 == 0)
            current = batch[partition * batchSize:(partition + 1) * batchSize]
logsoftmax = torch.nn.LogSoftmax()


def deepCopy(sentence):
    result = []
    for w in sentence:
        entry = {}
        for key, value in w.iteritems():
            entry[key] = value
        result.append(entry)
    return result


# Standard-normal priors over the direction and distance weights, one
# dimension per dependency type.
dhWeights_Prior = Normal(Variable(torch.FloatTensor([0.0] * len(itos_deps))),
                         Variable(torch.FloatTensor([1.0] * len(itos_deps))))
distanceWeights_Prior = Normal(Variable(torch.FloatTensor([0.0] * len(itos_deps))),
                               Variable(torch.FloatTensor([1.0] * len(itos_deps))))

counter = 0

corpus = CorpusIterator(language, "train")


def guide(corpus):
    # Variational posterior: independent Normals with learnable means and scales.
    mu_DH = pyro.param("mu_DH", Variable(torch.FloatTensor([0.0] * len(itos_deps)), requires_grad=True))
    mu_Dist = pyro.param("mu_Dist", Variable(torch.FloatTensor([0.0] * len(itos_deps)), requires_grad=True))
    sigma_DH = pyro.param("sigma_DH", Variable(torch.FloatTensor([1.0] * len(itos_deps)), requires_grad=True))
    sigma_Dist = pyro.param("sigma_Dist", Variable(torch.FloatTensor([1.0] * len(itos_deps)), requires_grad=True))
    dhWeights = pyro.sample("dhWeights", dist.Normal(mu_DH, sigma_DH))
    distanceWeights = pyro.sample("distanceWeights", dist.Normal(mu_Dist, sigma_Dist))


def model(corpus):
    global counter
    dhWeights = pyro.sample("dhWeights", dhWeights_Prior)
    distanceWeights = pyro.sample("distanceWeights", distanceWeights_Prior)
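# Hedged sketch (not from this file): with a recent Pyro, a model/guide pair
# like the one above would typically be optimized by stochastic variational
# inference. The optimizer settings and step count are placeholder assumptions.
import pyro
from pyro.infer import SVI, Trace_ELBO
from pyro.optim import Adam

svi = SVI(model, guide, Adam({"lr": 0.01}), loss=Trace_ELBO())
for step in range(1000):
    svi.step(corpus)  # one ELBO gradient step on the corpus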
        partitions = range(10)
        shuffle(partitions)
        for partition in partitions:
            counter += 1
            printHere = (counter % 5 == 0)
            current = batch[partition * batchSize:(partition + 1) * batchSize]
            _, _, _, newLoss, newWords = doForwardPass(current)
            devLoss += newLoss
            devWords += newWords
    return devLoss / devWords


while True:
    corpus = CorpusIterator(args.language).iterator(rejectShortSentences=True)
    while True:
        try:
            batch = map(lambda x: next(corpus), 10 * range(batchSize))
        except StopIteration:
            break
        batch = sorted(batch, key=len)
        partitions = range(10)
        shuffle(partitions)
        for partition in partitions:
            counter += 1
            printHere = (counter % 20 == 0)
            current = batch[partition * batchSize:(partition + 1) * batchSize]
            loss, baselineLoss, policy_related_loss, _, wordNumInPass = doForwardPass(
for line in inFile:
    line = line.strip().split(" ")
    if len(line) != 3:
        continue
    if line[1] != "ADJA":
        if wentThroughAdjectives:
            continue
        else:
            wentThroughAdjectives = True
    if int(line[0]) > 100 and not line[2].endswith("r"):
        adjectives.append(line[2])
print(len(adjectives))

from corpusIterator import CorpusIterator

data = CorpusIterator("German", partition="train", removePunctuation=False).iterator()
frames = []
for sentence in data:
    mits = []
    for word in sentence:
        # Collect adpositional "mit" case markers whose head is a noun.
        if word["lemma"] == "mit" and word["posUni"] == "ADP" and word["dep"] == "case":
            head = word["head"] - 1
            if head < 0:
                continue
            if sentence[head]["posUni"] not in ["NOUN", "PROPN"]:
                continue
            mits.append(word)
    if len(mits) > 0:
        mit = random.choice(mits)
    else:
        assert totalDepLength == 0
    numberOfWords = wordNum
    return (totalDepLength, numberOfWords, byType)


assert batchSize == 1

depLengths = []
outpath = "/u/scr/mhahn/japanese/" + str(myID)
with open(outpath, "w") as outFile:
    print >> outFile, "\t".join(["Sent", "Length"])
    counter = 0
    if True:
        corpus = CorpusIterator(language, "train", shuffleDataSeed=40)
        corpusIterator = corpus.iterator()
        if corpus.length() == 0:
            quit()
        while True:
            try:
                batch = map(lambda x: next(corpusIterator), 10 * range(batchSize))
            except StopIteration:
                break
            batch = sorted(batch, key=len)
            partitions = range(10)
            for partition in partitions:
                counter += 1
                printHere = (counter % 100 == 0)
            if targetWord >= vocab_size:
                # Out of vocabulary: back off to the POS tag index.
                input_indices.append(stoi_pos_uni[line["posUni"]] + 3)
            else:
                input_indices.append(targetWord + 3 + len(itos_pos_uni))
        yield input_indices, wordStartIndices + [len(input_indices)], relevant_logprob_sum
        input_indices = [2]  # Start of Segment (makes sure that the first word can be predicted from this token)
        wordStartIndices = []


DEV_PERIOD = 5000
epochCount = 0
corpusBase = CorpusIterator(args.language, storeMorph=True)
while failedDevRuns < args.stopAfterFailures:
    epochCount += 1
    print >> sys.stderr, "Epoch " + str(epochCount)
    print "Starting new epoch, permuting corpus"
    corpusBase.permute()
    corpus = corpusBase.iterator(rejectShortSentences=False)
    stream = createStream(corpus)
    if counter > 5:
        newDevLoss, _ = computeDevLoss()
        devLosses.append(newDevLoss)
    global printHere
    global devLosses
    global baselineAverageLoss
    batchOrderedLogits = zip(
        *map(lambda (y, x): orderSentence(x, dhLogits, y == 0 and printHere),
             zip(range(len(current)), current)))
    batchOrdered = batchOrderedLogits[0]
    logits = batchOrderedLogits[1]
    logitCorr = batchOrdered[0][-1]["relevant_logprob_sum"]
    print(logitCorr)
    return float(logitCorr)


corpusDev = CorpusIterator(language, "dev").iterator(rejectShortSentences=True)

totalLikelihood = 0
numberOfSentences = 0

while True:
    try:
        batch = map(lambda x: next(corpusDev), 1 * range(batchSize))
    except StopIteration:
        break
    batch = sorted(batch, key=len)
    partitions = range(1)
    shuffle(partitions)
    for partition in partitions:
        counter += 1
        printHere = (counter % 50 == 0)
args = parser.parse_args()
print(args)

assert args.language == "german"

import corpusIteratorWiki
from corpusIterator import CorpusIterator

if True:
    training = CorpusIterator("German", partition="train", storeMorph=True, removePunctuation=True)
    vocabulary = {"NOUN": set(), "VERB": set()}
    for sentence in training.iterator():
        for line in sentence:
            if line["posUni"] in vocabulary:
                vocabulary[line["posUni"]].add(line["word"].lower())

# Discard forms attested as both noun and verb. (Taking the intersection
# first makes both sets shrink; subtracting sequentially would leave the
# ambiguous forms in whichever set is updated second.)
ambiguous = vocabulary["NOUN"].intersection(vocabulary["VERB"])
vocabulary["NOUN"] = vocabulary["NOUN"].difference(ambiguous)
vocabulary["VERB"] = vocabulary["VERB"].difference(ambiguous)
print "Dev examples "+str(devCounter) devCounterTimesBatchSize += devBatchSize devSurprisalTableHere = [surp/(devCounterTimesBatchSize) for surp in surprisalTable] return devLoss/devWords, devSurprisalTableHere depLengths = 0 depsNum = 0 for EPOCH in range(30): corpusDev = CorpusIterator(language,"train", storeMorph=True).iterator(rejectShortSentences = False) SENT = 0 for sentence in corpusDev: ordereds = orderSentence(sentence, dhLogits, False) for ordered in ordereds: SENT += 1 #print(list(ordered)) loss = doForwardPass(sentence, ordered) doBackwardPass(loss) # ['relativeClause', 'betweenAndHead', 'after', 'before'] quit() DEV_PERIOD = 5000
myID = random.randint(0, 10000000)

deps = ["acl", "acl:relcl", "advcl", "advmod", "amod", "appos", "aux", "auxpass",
        "case", "cc", "ccomp", "compound", "compound:prt", "conj", "conj:preconj",
        "cop", "csubj", "csubjpass", "dep", "det", "det:predet", "discourse",
        "dobj", "expl", "foreign", "goeswith", "iobj", "list", "mark", "mwe",
        "neg", "nmod", "nmod:npmod", "nmod:poss", "nmod:tmod", "nsubj",
        "nsubjpass", "nummod", "parataxis", "punct", "remnant", "reparandum",
        "root", "vocative", "xcomp"]

from math import log, exp
from random import shuffle
from corpusIterator import CorpusIterator

corpus = CorpusIterator(language)
devSet = CorpusIterator(language, "dev")

leftCounts = {}
rightCounts = {}
conditionalCountsLR = {}
conditionalCountsRL = {}


def register(x, y, counts):
    # Nested count table: counts[x][y], plus a running total under '_TOTAL_'.
    if x not in counts:
        counts[x] = {'_TOTAL_': 0}
    counts[x][y] = counts[x].get(y, 0.0) + 1.0
    counts[x]['_TOTAL_'] = counts[x]['_TOTAL_'] + 1.0
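# Illustrative helper (an assumption, not in the original file): reading a
# conditional relative frequency back out of the nested tables that
# register() builds, e.g. P(y | x) under leftCounts.
def conditionalProb(counts, x, y):
    if x not in counts:
        return 0.0
    return counts[x].get(y, 0.0) / counts[x]['_TOTAL_']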
    newDevLossL = devLossL / devWords
    newDevLossU = devLossU / devWords
    newDevAccuracy = float(devAccuracy) / devWords
    newDevAccuracyLabeled = float(devAccuracyLabeled) / devWords
    devLossesL.append(newDevLossL)
    devLossesU.append(newDevLossU)
    devAccuracies.append(newDevAccuracy)
    devAccuraciesLabeled.append(newDevAccuracyLabeled)


counter = 0
epochs = 0
while True:
    corpus = CorpusIterator(language, "train").iterator(rejectShortSentences=True)
    partitions = getPartitions(corpus)
    epochs += 1
    for partition in partitions:
        if counter > maxNumberOfUpdates:
            print "Ran for a long time, quitting."
            quit()
        counter += 1
        printHere = (counter % 100 == 0)
        _, loss, policyLoss, _, _, wordNum = forward(partition)
        if wordNum == 0:
            assert loss == 0  # was `loss is 0`; identity comparison with numbers is unreliable
        else:
            backward(loss, policyLoss)
        if printHere: