class CorpusIteratorFuncHead_V():
    """Corpus reader that runs ``reverse_content_head`` on each sentence.

    All corpus handling (loading, shuffling, length, indexing) is
    delegated to a wrapped ``CorpusIterator_V`` kept in ``self.basis``;
    this class only post-processes the sentences it hands out.
    """

    def __init__(self,
                 language,
                 partition="train",
                 storeMorph=False,
                 splitLemmas=False,
                 shuffleDataSeed=None):
        """Build the wrapped ``CorpusIterator_V``.

        Every argument is forwarded unchanged to ``CorpusIterator_V``;
        their exact meaning is defined there.
        """
        forwarded = dict(partition=partition,
                         storeMorph=storeMorph,
                         splitLemmas=splitLemmas,
                         shuffleDataSeed=shuffleDataSeed)
        self.basis = CorpusIterator_V(language, **forwarded)

    def permute(self):
        """Ask the wrapped iterator to permute its data."""
        wrapped = self.basis
        wrapped.permute()

    def length(self):
        """Return the length reported by the wrapped iterator."""
        wrapped = self.basis
        return wrapped.length()

    def iterator(self, rejectShortSentences=False):
        """Yield each sentence of the wrapped iterator after conversion.

        ``reverse_content_head`` is called for its effect on the
        sentence object; its return value is ignored here and the same
        sentence object is yielded.
        """
        for converted in self.basis.iterator(
                rejectShortSentences=rejectShortSentences):
            reverse_content_head(converted)
            yield converted

    def getSentence(self, index):
        """Return the converted sentence stored at ``index``.

        NOTE(review): unlike ``iterator``, this returns whatever
        ``reverse_content_head`` returns -- confirm that helper returns
        the sentence rather than ``None``.
        """
        raw = self.basis.getSentence(index)
        return reverse_content_head(raw)
class CorpusIteratorFuncHeadFraction_V():
    """Like a func-head corpus reader, but restricted to a data prefix.

    Wraps a ``CorpusIterator_V`` (``self.basis``), keeps only the first
    ``fraction`` of its ``data`` and applies ``reverse_content_head`` to
    every sentence handed out.
    """

    def __init__(self, language, partition="train", fraction=1.0,
                 storeMorph=False, splitLemmas=False, shuffleDataSeed=4):
        """Load the corpus and cut it down to the requested fraction.

        ``language``, ``partition``, ``storeMorph``, ``splitLemmas`` and
        ``shuffleDataSeed`` are forwarded to ``CorpusIterator_V``.

        ``fraction`` is the share of ``self.basis.data`` to keep; the
        cut-off index is ``int(fraction * len(data))``.

        ``shuffleDataSeed`` defaults to 4, the value that used to be
        hard-coded. Presumably a fixed seed keeps the retained subset
        reproducible across runs -- confirm against ``CorpusIterator_V``.
        """
        self.fraction = fraction
        self.basis = CorpusIterator_V(language,
                                      partition=partition,
                                      storeMorph=storeMorph,
                                      splitLemmas=splitLemmas,
                                      shuffleDataSeed=shuffleDataSeed)
        # Truncate first, then permute, so the permutation only
        # reorders the sentences that were kept.
        keep = int(fraction * len(self.basis.data))
        self.basis.data = self.basis.data[:keep]
        self.permute()

    def permute(self):
        """Ask the wrapped iterator to permute its (truncated) data."""
        self.basis.permute()

    def length(self):
        """Return the length reported by the wrapped iterator."""
        return self.basis.length()

    def iterator(self, rejectShortSentences=False):
        """Yield each retained sentence after ``reverse_content_head``.

        Prints the current length once when iteration starts. The
        fraction limit is already enforced by the truncation done in
        ``__init__``, so no counting is needed here.
        """
        sentences = self.basis.iterator(
            rejectShortSentences=rejectShortSentences)
        # Written with parentheses; under Python 2 this prints a tuple.
        print("Actual length", self.length())
        for sentence in sentences:
            reverse_content_head(sentence)
            yield sentence

    def getSentence(self, index):
        """Return the converted sentence stored at ``index``.

        NOTE(review): returns the result of ``reverse_content_head`` --
        confirm that helper returns the sentence rather than ``None``.
        """
        return reverse_content_head(self.basis.getSentence(index))
Exemplo n.º 3
0
       numberOfWords = wordNum
       return (totalDepLength, numberOfWords, byType)



# Top-level driver: reads the training corpus in groups of sentences and
# opens a tab-separated output file. This is Python 2 code (print-chevron
# syntax, eager map). NOTE(review): the excerpt appears to stop in the
# middle of the per-partition loop -- the code that uses `current`,
# `printHere` and `depLengths` is not visible here.

# The slicing below assumes one sentence per partition.
assert batchSize == 1

depLengths = []
#while True:
# Output file is named after `myID` (defined elsewhere in the script).
outpath = "/u/scr/mhahn/japanese/"+str(myID)
with open(outpath, "w") as outFile:
 # Header row: two tab-separated columns, "Sent" and "Length".
 print >> outFile, "\t".join(["Sent", "Length"])
 counter = 0
 # Always-true guard; it only adds an indentation level.
 if True:
   corpus = CorpusIterator_V(language,"train", shuffleDataSeed=40)
   corpusIterator = corpus.iterator()
   # Nothing to do for an empty corpus.
   if corpus.length() == 0:
      quit()
   while True:
     try:
        # Pull 10*batchSize sentences at once. This relies on Python 2,
        # where range() is a list and map() is eager, so StopIteration
        # is raised right here once the corpus runs out.
        batch = map(lambda x:next(corpusIterator), 10*range(batchSize))
     except StopIteration:
        # Corpus exhausted; a partially read final group is dropped.
        break
     # Order the group by len() of each sentence.
     batch = sorted(batch, key=len)
     partitions = range(10)
     
     # Walk the group in 10 consecutive slices of batchSize sentences.
     for partition in partitions:
        counter += 1
        # True on every 100th slice processed.
        printHere = (counter % 100 == 0)
        current = batch[partition*batchSize:(partition+1)*batchSize]
 
Exemplo n.º 4
0
            print "Dev examples " + str(devCounter)
        devCounterTimesBatchSize += devBatchSize
    devSurprisalTableHere = [
        surp / (devCounterTimesBatchSize) for surp in surprisalTable
    ]
    return devLoss / devWords, devSurprisalTableHere

DEV_PERIOD = 5000
epochCount = 0
corpusBase = CorpusIterator_V(language, storeMorph=True)
while failedDevRuns == 0:
    epochCount += 1
    print "Starting new epoch, permuting corpus"
    corpusBase.permute()
    #  corpus = getNextSentence("train")
    corpus = corpusBase.iterator(rejectShortSentences=False)
    stream = createStream(corpus)

    if counter > 5:
        #       if counter % DEV_PERIOD == 0:
        newDevLoss, devSurprisalTableHere = computeDevLoss()
        #             devLosses.append(
        devLosses.append(newDevLoss)
        print "New dev loss " + str(newDevLoss) + ". previous was: " + str(
            lastDevLoss)
        if newDevLoss > 15 or len(devLosses) > 99:
            print "Abort, training too slow?"
            devLosses.append(newDevLoss + 0.001)

        if lastDevLoss is None or newDevLoss < lastDevLoss:
            devSurprisalTable = devSurprisalTableHere