class CorpusIteratorFuncHead_V():
    """Wrap a ``CorpusIterator_V`` and run ``reverse_content_head`` on its sentences.

    The wrapped iterator is stored in ``self.basis``; ``permute`` and
    ``length`` are forwarded to it unchanged.
    """

    def __init__(self, language, partition="train", storeMorph=False, splitLemmas=False, shuffleDataSeed=None):
        """Create the wrapped ``CorpusIterator_V`` with the given options."""
        options = dict(
            partition=partition,
            storeMorph=storeMorph,
            splitLemmas=splitLemmas,
            shuffleDataSeed=shuffleDataSeed
        )
        self.basis = CorpusIterator_V(language, **options)

    def permute(self):
        """Forward to ``self.basis.permute()``."""
        self.basis.permute()

    def length(self):
        """Return whatever ``self.basis.length()`` reports."""
        return self.basis.length()

    def iterator(self, rejectShortSentences=False):
        """Yield each sentence of the wrapped iterator after ``reverse_content_head``.

        ``rejectShortSentences`` is passed through to ``self.basis.iterator``.
        """
        sentences = self.basis.iterator(rejectShortSentences=rejectShortSentences)
        for sentence in sentences:
            # NOTE(review): the return value is ignored here, so this relies on
            # reverse_content_head modifying the sentence in place -- confirm.
            reverse_content_head(sentence)
            yield sentence

    def getSentence(self, index):
        """Return ``reverse_content_head`` applied to the sentence at ``index``."""
        sentence = self.basis.getSentence(index)
        return reverse_content_head(sentence)
class CorpusIteratorFuncHeadFraction_V():
    """Wrap a ``CorpusIterator_V`` restricted to a leading fraction of its data.

    The wrapped iterator is always built with ``shuffleDataSeed=4``. Its
    ``data`` list is then cut down to the first
    ``int(fraction * len(data))`` entries, after which ``permute`` is called
    once. Sentences handed out by ``iterator`` and ``getSentence`` are passed
    through ``reverse_content_head``.
    """

    def __init__(self, language, partition="train", fraction=1.0, storeMorph=False, splitLemmas=False):
        """Build the wrapped iterator and truncate its data.

        ``fraction`` is the share of ``self.basis.data`` to keep, counted from
        the front of the list; it is also stored as ``self.fraction``.
        """
        self.basis = CorpusIterator_V(language, partition=partition, storeMorph=storeMorph, splitLemmas=splitLemmas, shuffleDataSeed=4)
        # Truncate before permuting so the retained subset is fixed by the
        # order the wrapped iterator produced, not by the later permutation.
        self.basis.data = self.basis.data[:int(fraction*len(self.basis.data))]
        self.permute()
        self.fraction = fraction

    def permute(self):
        """Forward to ``self.basis.permute()``."""
        self.basis.permute()

    def length(self):
        """Return ``self.basis.length()``."""
        return self.basis.length()

    def iterator(self, rejectShortSentences = False):
        """Yield each remaining sentence after ``reverse_content_head``.

        Prints the current length once, when iteration starts.
        ``rejectShortSentences`` is passed through to ``self.basis.iterator``.
        """
        iterator = self.basis.iterator(rejectShortSentences=rejectShortSentences)
        print("Actual length", self.length())
        for sentence in iterator:
            # NOTE(review): the return value is ignored here, so this relies on
            # reverse_content_head modifying the sentence in place -- confirm.
            reverse_content_head(sentence)
            yield sentence

    def getSentence(self, index):
        """Return ``reverse_content_head`` applied to the sentence at ``index``."""
        return reverse_content_head(self.basis.getSentence(index))
numberOfWords = wordNum return (totalDepLength, numberOfWords, byType) assert batchSize == 1 depLengths = [] #while True: outpath = "/u/scr/mhahn/japanese/"+str(myID) with open(outpath, "w") as outFile: print >> outFile, "\t".join(["Sent", "Length"]) counter = 0 if True: corpus = CorpusIterator_V(language,"train", shuffleDataSeed=40) corpusIterator = corpus.iterator() if corpus.length() == 0: quit() while True: try: batch = map(lambda x:next(corpusIterator), 10*range(batchSize)) except StopIteration: break batch = sorted(batch, key=len) partitions = range(10) for partition in partitions: counter += 1 printHere = (counter % 100 == 0) current = batch[partition*batchSize:(partition+1)*batchSize]
print "Dev examples " + str(devCounter) devCounterTimesBatchSize += devBatchSize devSurprisalTableHere = [ surp / (devCounterTimesBatchSize) for surp in surprisalTable ] return devLoss / devWords, devSurprisalTableHere DEV_PERIOD = 5000 epochCount = 0 corpusBase = CorpusIterator_V(language, storeMorph=True) while failedDevRuns == 0: epochCount += 1 print "Starting new epoch, permuting corpus" corpusBase.permute() # corpus = getNextSentence("train") corpus = corpusBase.iterator(rejectShortSentences=False) stream = createStream(corpus) if counter > 5: # if counter % DEV_PERIOD == 0: newDevLoss, devSurprisalTableHere = computeDevLoss() # devLosses.append( devLosses.append(newDevLoss) print "New dev loss " + str(newDevLoss) + ". previous was: " + str( lastDevLoss) if newDevLoss > 15 or len(devLosses) > 99: print "Abort, training too slow?" devLosses.append(newDevLoss + 0.001) if lastDevLoss is None or newDevLoss < lastDevLoss: devSurprisalTable = devSurprisalTableHere