class CorpusIteratorFuncHead():
    """Wrap a ``CorpusIterator`` and run ``reverse_content_head`` on sentences.

    Every sentence handed out by :meth:`iterator` or :meth:`getSentence` is
    first passed through ``reverse_content_head``. All other operations are
    forwarded unchanged to the wrapped iterator, stored in ``self.basis``.
    """

    def __init__(self,
                 language,
                 partition="train",
                 storeMorph=False,
                 splitLemmas=False):
        """Create the underlying ``CorpusIterator``.

        All arguments are passed straight through to ``CorpusIterator``;
        their meaning is defined there.
        """
        options = dict(partition=partition,
                       storeMorph=storeMorph,
                       splitLemmas=splitLemmas)
        self.basis = CorpusIterator(language, **options)

    def permute(self):
        """Forward to ``permute()`` of the wrapped iterator."""
        self.basis.permute()

    def length(self):
        """Return ``length()`` of the wrapped iterator."""
        return self.basis.length()

    def iterator(self, rejectShortSentences=False):
        """Yield each sentence of the wrapped iterator after conversion.

        ``rejectShortSentences`` is forwarded to the wrapped iterator.
        The return value of ``reverse_content_head`` is ignored here and the
        original sentence object is yielded, so the conversion is presumably
        done in place -- verify against ``reverse_content_head``.
        """
        source = self.basis.iterator(
            rejectShortSentences=rejectShortSentences)
        for item in source:
            reverse_content_head(item)
            yield item

    def getSentence(self, index):
        """Return ``reverse_content_head`` applied to the sentence at ``index``."""
        raw_sentence = self.basis.getSentence(index)
        return reverse_content_head(raw_sentence)
class CorpusIteratorFuncHeadFraction():
    """Like a func-head corpus iterator, but restricted to a data prefix.

    The wrapped ``CorpusIterator`` (``self.basis``) is created with
    ``shuffleDataSeed=4``; only the leading ``fraction`` of ``basis.data`` is
    kept, and the kept portion is then permuted. Sentences handed out by
    :meth:`iterator` and :meth:`getSentence` are passed through
    ``reverse_content_head``.
    """

    def __init__(self,
                 language,
                 partition="train",
                 fraction=1.0,
                 storeMorph=False,
                 splitLemmas=False):
        """Load the corpus and keep only the first ``fraction`` of it.

        Args:
            language: forwarded to ``CorpusIterator``.
            partition: forwarded to ``CorpusIterator``.
            fraction: share of ``basis.data`` to keep, counted from the
                start; the cut-off index is ``int(fraction * len(data))``.
                Values of 1.0 or more keep everything.
            storeMorph: forwarded to ``CorpusIterator``.
            splitLemmas: forwarded to ``CorpusIterator``.

        Raises:
            ValueError: if ``fraction`` is negative.
        """
        # A negative fraction would turn into a negative slice bound and
        # silently drop sentences from the *end* of the data instead of
        # selecting a prefix, so reject it before doing any loading work.
        if fraction < 0:
            raise ValueError(
                "fraction must be non-negative, got %r" % (fraction,))
        self.fraction = fraction
        self.basis = CorpusIterator(language,
                                    partition=partition,
                                    storeMorph=storeMorph,
                                    splitLemmas=splitLemmas,
                                    shuffleDataSeed=4)
        keep = int(fraction * len(self.basis.data))
        self.basis.data = self.basis.data[:keep]
        # Permute only after truncation so the kept subset itself is fixed
        # by the slice above.
        self.permute()

    def permute(self):
        """Forward to ``permute()`` of the wrapped iterator."""
        self.basis.permute()

    def length(self):
        """Return ``length()`` of the wrapped iterator."""
        return self.basis.length()

    def iterator(self, rejectShortSentences=False):
        """Yield each sentence of the wrapped iterator after conversion.

        Prints the current length once, when iteration starts.
        ``rejectShortSentences`` is forwarded to the wrapped iterator. The
        return value of ``reverse_content_head`` is ignored and the original
        sentence object is yielded, so the conversion is presumably done in
        place -- verify against ``reverse_content_head``.
        """
        iterator = self.basis.iterator(
            rejectShortSentences=rejectShortSentences)
        print("Actual length", self.length())
        for sentence in iterator:
            reverse_content_head(sentence)
            yield sentence

    def getSentence(self, index):
        """Return ``reverse_content_head`` applied to the sentence at ``index``."""
        return reverse_content_head(self.basis.getSentence(index))
Пример #3
0
       yield input_indices, wordStartIndices+[len(input_indices)], relevant_logprob_sum
       input_indices = [2] # Start of Segment (makes sure that first word can be predicted from this token)
       wordStartIndices = []





DEV_PERIOD = 5000
epochCount = 0
corpusBase = CorpusIterator(args.language, storeMorph=True)
while failedDevRuns < args.stopAfterFailures:
  epochCount += 1
  print >> sys.stderr, "Epoch "+str(epochCount)
  print "Starting new epoch, permuting corpus"
  corpusBase.permute()
#  corpus = getNextSentence("train")
  corpus = corpusBase.iterator(rejectShortSentences = False)
  stream = createStream(corpus)


  if counter > 5:
#       if counter % DEV_PERIOD == 0:
          newDevLoss, _ = computeDevLoss()
#             devLosses.append(
          devLosses.append(newDevLoss)


#          newDevLoss = devLosses[-1]-1
 #         print("DON'T STOP don't stop")
            printHere = (counter % 50 == 0)
            current = batch[partition * batchSize:(partition + 1) * batchSize]

            _, _, _, newLoss, newWords, lossWords, lossPOS = doForwardPass(
                current, train=False)
            devLoss += newLoss
            devWords += newWords
            devLossWords += lossWords
            devLossPOS += lossPOS
    return devLoss / devWords, devLossWords / devWords, devLossPOS / devWords


while True:
    #  corpus = getNextSentence("train")
    corpus = CorpusIterator(language)
    corpus.permute()
    corpus = corpus.iterator(rejectShortSentences=True)

    while True:
        try:
            batch = map(lambda x: next(corpus), 10 * range(batchSize))
        except StopIteration:
            break
        batch = sorted(batch, key=len)
        partitions = range(10)
        shuffle(partitions)
        for partition in partitions:
            counter += 1
            printHere = (counter % 100 == 0)
            current = batch[partition * batchSize:(partition + 1) * batchSize]