def computeDevLoss(order="mixed"):
    devBatchSize = 512
    global printHere
    global horizon
    devLoss = 0.0
    devWords = 0
    corpusDev = CorpusIterator(language, "dev", storeMorph=True).iterator(rejectShortSentences=False)
    stream = createStreamContinuous(corpusDev, order=order)
    surprisalTable = [0 for _ in range(horizon)]
    devCounter = 0
    devCounterTimesBatchSize = 0
    while True:
        # Collect up to devBatchSize examples; the final batch may be shorter.
        input_indices_list = []
        wordStartIndices_list = []
        try:
            for _ in range(devBatchSize):
                input_indices, wordStartIndices = next(stream)
                input_indices_list.append(input_indices)
                wordStartIndices_list.append(wordStartIndices)
        except StopIteration:
            devBatchSize = len(input_indices_list)
        if devBatchSize == 0:
            break
        devCounter += 1
        printHere = (devCounter % 100 == 0)
        _, _, _, newLoss, newWords = doForwardPass(input_indices_list, wordStartIndices_list,
                                                   surprisalTable=surprisalTable, doDropout=False,
                                                   batchSizeHere=devBatchSize)
        devLoss += newLoss
        devWords += newWords
        if printHere:
            print "Dev examples " + str(devCounter)
        devCounterTimesBatchSize += devBatchSize
    devSurprisalTableHere = [surp / devCounterTimesBatchSize for surp in surprisalTable]
    return devLoss / devWords, devSurprisalTableHere
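# The loop above uses a common pattern: pull up to devBatchSize items from a
# generator and shrink the final batch when the stream runs out. A minimal
# self-contained sketch of the same pattern (names here are illustrative,
# not from the original file):
def batched(stream, batch_size):
    while True:
        batch = []
        try:
            for _ in range(batch_size):
                batch.append(next(stream))
        except StopIteration:
            pass  # stream exhausted; fall through with a partial batch
        if len(batch) == 0:
            break
        yield batch

# Example: list(batched(iter(range(5)), 2)) == [[0, 1], [2, 3], [4]]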
def initializeOrderTable():
    orderTable = {}
    keys = set()
    vocab = {}
    distanceSum = {}
    distanceCounts = {}
    depsVocab = set()
    for partition in ["train", "dev"]:
        for sentence in CorpusIterator(language, partition).iterator():
            for line in sentence:
                vocab[line["word"]] = vocab.get(line["word"], 0) + 1
                line["coarse_dep"] = makeCoarse(line["dep"])
                depsVocab.add(line["coarse_dep"])
                posFine.add(line["posFine"])
                posUni.add(line["posUni"])
                if line["coarse_dep"] == "root":
                    continue
                posHere = line["posUni"]
                posHead = sentence[line["head"] - 1]["posUni"]
                dep = line["coarse_dep"]
                direction = "HD" if line["head"] < line["index"] else "DH"
                infostruc = line["infostruc"]
                key = (dep, infostruc)
                keyWithDir = (dep, direction)
                orderTable[keyWithDir] = orderTable.get(keyWithDir, 0) + 1
                keys.add(key)
                distanceCounts[key] = distanceCounts.get(key, 0.0) + 1.0
                distanceSum[key] = distanceSum.get(key, 0.0) + abs(line["index"] - line["head"])
    dhLogits = {}
    for key in keys:
        # orderTable is keyed by (dep, direction), so look up via the
        # dependency type key[0], not the full (dep, infostruc) key.
        hd = orderTable.get((key[0], "HD"), 0) + 1.0
        dh = orderTable.get((key[0], "DH"), 0) + 1.0
        dhLogit = log(dh) - log(hd)
        dhLogits[key] = dhLogit
        originalDistanceWeights[key] = (distanceSum[key] / distanceCounts[key])
    return dhLogits, vocab, keys, depsVocab
def initializeOrderTable():
    orderTable = {}
    keys = set()
    vocab = {}
    distanceSum = {}
    distanceCounts = {}
    depsVocab = set()
    for partition in ["train", "dev"]:
        for sentence in CorpusIterator(args.language, partition, storeMorph=True).iterator():
            for line in sentence:
                vocab[line["word"]] = vocab.get(line["word"], 0) + 1
                vocab_lemmas[line["lemma"]] = vocab_lemmas.get(line["lemma"], 0) + 1
                depsVocab.add(line["dep"])
                posFine.add(line["posFine"])
                posUni.add(line["posUni"])
                for morph in line["morph"]:
                    morphKeyValuePairs.add(morph)
                if line["dep"] == "root":
                    continue
                posHere = line["posUni"]
                posHead = sentence[line["head"] - 1]["posUni"]
                dep = line["dep"]
                direction = "HD" if line["head"] < line["index"] else "DH"
                key = (posHead, dep, posHere)
                keyWithDir = (posHead, dep, posHere, direction)
                orderTable[keyWithDir] = orderTable.get(keyWithDir, 0) + 1
                keys.add(key)
                distanceCounts[key] = distanceCounts.get(key, 0.0) + 1.0
                distanceSum[key] = distanceSum.get(key, 0.0) + abs(line["index"] - line["head"])
    dhLogits = {}
    for key in keys:
        hd = orderTable.get((key[0], key[1], key[2], "HD"), 0) + 1.0
        dh = orderTable.get((key[0], key[1], key[2], "DH"), 0) + 1.0
        dhLogit = log(dh) - log(hd)
        dhLogits[key] = dhLogit
        originalDistanceWeights[key] = (distanceSum[key] / distanceCounts[key])
    return dhLogits, vocab, keys, depsVocab
def initializeOrderTable():
    orderTable = {}
    keys = set()
    vocab = {}
    distanceSum = {}
    distanceCounts = {}
    depsVocab = set()
    for partition in ["train", "dev"]:
        for sentence in CorpusIterator(args.language, partition).iterator():
            sentHash = hashlib.sha224(" ".join([x["word"] for x in sentence])).hexdigest()
            for sentNum, line in enumerate(sentence):
                vocab[line["word"]] = vocab.get(line["word"], 0) + 1
                line["coarse_dep"] = makeCoarse(line["dep"])
                depsVocab.add(line["coarse_dep"])
                posFine.add(line["posFine"])
                posUni.add(line["posUni"])
                if line["coarse_dep"] == "root":
                    continue
                posHere = line["posUni"]
                posHead = sentence[line["head"] - 1]["posUni"]
                dep = line["coarse_dep"]
                direction = "HD" if line["head"] < line["index"] else "DH"
                # One key per (sentence, token position): every dependency
                # instance is treated as its own type in this variant.
                key = sentHash + "_" + str(sentNum)
                keyWithDir = (dep, direction)
                orderTable[keyWithDir] = orderTable.get(keyWithDir, 0) + 1
                keys.add(key)
                distanceCounts[key] = distanceCounts.get(key, 0.0) + 1.0
                distanceSum[key] = distanceSum.get(key, 0.0) + abs(line["index"] - line["head"])
    dhLogits = {}
    for key in keys:
        # Per-token keys never match the (dep, direction) entries in orderTable,
        # so hd == dh == 1.0 and every logit is 0; the per-token distance
        # averages are what this variant actually uses.
        hd = orderTable.get((key, "HD"), 0) + 1.0
        dh = orderTable.get((key, "DH"), 0) + 1.0
        dhLogit = log(dh) - log(hd)
        dhLogits[key] = dhLogit
        originalDistanceWeights[key] = (distanceSum[key] / distanceCounts[key])
    return dhLogits, vocab, keys, depsVocab
def computeDevLoss():
    devBatchSize = 32
    global printHere
    devLoss = 0.0
    devWords = 0
    corpusDev = CorpusIterator(args.language, "dev").iterator(rejectShortSentences=False)
    stream = createStream(corpusDev, training=False)
    surprisalTable = [0 for _ in range(2)]
    devCounter = 0
    devCounterTimesBatchSize = 0
    while True:
        input_indices_list = []
        wordStartIndices_list = []
        try:
            for _ in range(devBatchSize):
                input_indices, wordStartIndices, _ = next(stream)
                input_indices_list.append(input_indices)
                wordStartIndices_list.append(wordStartIndices)
        except StopIteration:
            devBatchSize = len(input_indices_list)
        if devBatchSize == 0:
            break
        devCounter += 1
        printHere = (devCounter % 100 == 0)
        with torch.no_grad():
            _, _, _, newLoss, newWords = doForwardPass(
                input_indices_list, wordStartIndices_list,
                surprisalTable=surprisalTable, doDropout=False,
                batchSizeHere=devBatchSize, relevant_logprob_sum=None)
        devLoss += newLoss
        devWords += newWords
        if printHere:
            print "Dev examples " + str(devCounter)
        devCounterTimesBatchSize += devBatchSize
    return devLoss / devWords, None  # devSurprisalTableHere is not computed in this variant
def computeDevLoss():
    global printHere
    counterDev = 0
    corpusDev = CorpusIterator(language, "dev").iterator(rejectShortSentences=True)
    partitionsDev = getPartitions(corpusDev)
    devLossU = 0
    devLossL = 0
    devAccuracy = 0
    devAccuracyLabeled = 0
    devWords = 0
    for partitionDev in partitionsDev:
        counterDev += 1
        printHere = (counterDev % 500 == 0)
        lossU, lossL, _, accuracy, accuracyLabeled, wordNum = forward(
            partitionDev, computeAccuracy=True, doDropout=False)
        devLossU += lossU.data.cpu().numpy()
        devLossL += lossL.data.cpu().numpy()
        devAccuracy += accuracy
        devAccuracyLabeled += accuracyLabeled
        devWords += wordNum
        if counterDev % 50 == 0:
            print "Run on dev " + str(counterDev)
            print(devLossU / devWords, devLossL / devWords,
                  float(devAccuracy) / devWords,
                  float(devAccuracyLabeled) / devWords, devWords)
    newDevLossL = devLossL / devWords
    newDevLossU = devLossU / devWords
    newDevAccuracy = float(devAccuracy) / devWords
    newDevAccuracyLabeled = float(devAccuracyLabeled) / devWords
    devLossesL.append(newDevLossL)
    devLossesU.append(newDevLossU)
    devAccuracies.append(newDevAccuracy)
    devAccuraciesLabeled.append(newDevAccuracyLabeled)
def initializeOrderTable():
    orderTable = {}
    keys = set()
    vocab = {}
    distanceSum = {}
    distanceCounts = {}
    depsVocab = set()
    for partition in ["train", "dev"]:
        for sentence in CorpusIterator(args.language, partition).iterator():
            for line in sentence:
                vocab[line["word"]] = vocab.get(line["word"], 0) + 1
                line["coarse_dep"] = makeCoarse(line["dep"])
                depsVocab.add(line["coarse_dep"])
                posFine.add(line["posFine"])
                posUni.add(line["posUni"])
                if line["coarse_dep"] == "root":
                    continue
                posHere = line["posUni"]
                posHead = sentence[line["head"] - 1]["posUni"]
                dep = line["coarse_dep"]
                direction = "HD" if line["head"] < line["index"] else "DH"
                key = dep
                keyWithDir = (dep, direction)
                orderTable[keyWithDir] = orderTable.get(keyWithDir, 0) + 1
                keys.add(key)
                distanceCounts[key] = distanceCounts.get(key, 0.0) + 1.0
                distanceSum[key] = distanceSum.get(key, 0.0) + abs(line["index"] - line["head"])
    dhLogits = {}
    for key in keys:
        # Add-one smoothed log-odds of dependent-head vs. head-dependent order.
        hd = orderTable.get((key, "HD"), 0) + 1.0
        dh = orderTable.get((key, "DH"), 0) + 1.0
        dhLogit = log(dh) - log(hd)
        dhLogits[key] = dhLogit
    return dhLogits, vocab, keys, depsVocab
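# Toy illustration (not from the original script) of the smoothed direction
# logit computed above: a dependency type seen 30 times head-first (HD) and
# 10 times dependent-first (DH) gets, with add-one smoothing,
# log(10 + 1) - log(30 + 1) ~= -1.04, i.e. a preference for head-first order.
from math import log

toyOrderTable = {("amod", "HD"): 30, ("amod", "DH"): 10}
hd = toyOrderTable.get(("amod", "HD"), 0) + 1.0
dh = toyOrderTable.get(("amod", "DH"), 0) + 1.0
print(log(dh) - log(hd))  # ~ -1.04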
        counter += 1
        printHere = (counter % 1000 == 0)
        current = batch[partition * batchSize:(partition + 1) * batchSize]
        _, _, _, newLoss, newWords, lossWords, lossPOS = doForwardPassEvaluate(current, train=False)
        devLoss += newLoss
        devWords += newWords
        devLossWords += lossWords
        devLossPOS += lossPOS
    return devLoss / devWords, devLossWords / devWords, devLossPOS / devWords


if True:
    corpus = CorpusIterator(args.language).iterator()
    while True:
        try:
            batch = [next(corpus)]
        except StopIteration:
            break
        partitions = range(1)
        for partition in partitions:
            counter += 1
            printHere = (counter % 1000 == 0)
            current = batch[partition * batchSize:(partition + 1) * batchSize]
            doForwardPassTrain(current)
            ])
            wordNum += 1
    if wordNum > 0:
        # Exponential moving average of per-word dependency length.
        crossEntropy = 0.99 * crossEntropy + 0.01 * (totalDepLength / wordNum)
    else:
        assert totalDepLength == 0
    numberOfWords = wordNum
    return (totalDepLength, numberOfWords, byType)


assert batchSize == 1

depLengths = []
if True:
    corpus = CorpusIterator(args.language, "train")
    corpusIterator = corpus.iterator()
    if corpus.length() == 0:
        quit()
    while True:
        try:
            batch = [next(corpusIterator)]
        except StopIteration:
            break
        partitions = range(1)
        for partition in partitions:
            counter += 1
            printHere = (counter % 200 == 0)
            current = batch[partition * batchSize:(partition + 1) * batchSize]
            if len(current) == 0:
"index", "word", "lemma", "posUni", "posFine", "morph", "head", "dep", "_", "_" ] originalDistanceWeights = {} orderTable = {} keys = set() vocab = {} distanceSum = {} distanceCounts = {} depsVocab = set() totalCount = 0 for partition in ["train", "dev"]: for sentence in CorpusIterator(language, partition, storeMorph=True).iterator(): for line in sentence: vocab[line["word"]] = vocab.get(line["word"], 0) + 1 depsVocab.add(line["dep"]) posFine.add(line["posFine"]) posUni.add(line["posUni"]) if line["dep"] == "root": continue posHere = line["posUni"] posHead = sentence[line["head"] - 1]["posUni"] dep = line["dep"] direction = "HD" if line["head"] < line["index"] else "DH" key = (posHead, dep, posHere) keyWithDir = (posHead, dep, posHere, direction)
                                                   batchSizeHere=devBatchSize)
        devLoss += newLoss
        devWords += newWords
        if printHere:
            print "Dev examples " + str(devCounter)
        devCounterTimesBatchSize += devBatchSize
    devSurprisalTableHere = [surp / devCounterTimesBatchSize for surp in surprisalTable]
    return devLoss / devWords, devSurprisalTableHere


DEV_PERIOD = 5000
epochCount = 0
corpusBase = CorpusIterator(language, storeMorph=True, trainSize=trainingSize, devSize=devSize)
while failedDevRuns == 0:
    epochCount += 1
    print "Starting new epoch, permuting corpus"
    corpus = corpusBase.iterator(rejectShortSentences=False)
    stream = createStream(corpus)
    if counter > 5:
        newDevLoss, devSurprisalTableHere = computeDevLoss()
        devLosses.append(newDevLoss)
        print "New dev loss " + str(newDevLoss) + ". previous was: " + str(lastDevLoss)
        if newDevLoss > 15 or len(devLosses) > 99:
            print "Abort, training too slow?"
            devLosses.append(newDevLoss + 0.001)
def genderTest(mode):
    training = CorpusIterator("German", partition="train", storeMorph=True, removePunctuation=True)
    # Collect singular nominative noun lemmas by grammatical gender.
    genders = dict([("Gender=" + x, set()) for x in ["Masc", "Fem", "Neut"]])
    for sentence in training.iterator():
        for line in sentence:
            if line["posUni"] == "NOUN" and "|" not in line["lemma"]:
                morph = line["morph"]
                if "Number=Sing" in morph and "Case=Nom" in morph:
                    gender = [x for x in morph if x.startswith("Gender=")]
                    if len(gender) > 0:
                        genders[gender[0]].add(line["lemma"].lower())
    for gender in genders:
        print("OOV Ratio for ", gender,
              sum([0 if x in stoi else 1 for x in genders[gender]]) / len(genders[gender]))
    counter = 0
    results = [[0, 0, 0] for _ in range(3)]
    for genderIndex, gender in enumerate(["Gender=" + x for x in ["Masc", "Fem", "Neut"]]):
        with open(f"stimuli/german-gender-{gender}-{mode}-noOOVs.txt", "w") as outFile:
            counter = 0
            for noun in genders[gender]:
                if noun not in stoi:
                    continue
                counter += 1
                # Sample an in-vocabulary adjective, inflected with -e.
                chosenAdjective = "_NONE_"
                while chosenAdjective not in stoi:
                    chosenAdjective = random.choice(adjectives) + "e"
                if mode == "nothing":
                    nounStimulus = [noun]
                elif mode == "adjective":
                    nounStimulus = [chosenAdjective, noun]
                elif mode == "sehr + adjective":
                    nounStimulus = ["sehr", chosenAdjective, noun]
                elif mode == "sehr + extrem + adjective":
                    nounStimulus = ["sehr", "extrem", chosenAdjective, noun]
                print(" ".join(["der"] + nounStimulus), file=outFile)
                print(" ".join(["die"] + nounStimulus), file=outFile)
                print(" ".join(["das"] + nounStimulus), file=outFile)
                results[genderIndex][doChoiceList(
                    [". der " + " ".join(nounStimulus) + " .",
                     ". die " + " ".join(nounStimulus) + " .",
                     ". das " + " ".join(nounStimulus) + " ."],
                    printHere=(random.random() > 0.98))] += 1
                if random.random() > 0.98:
                    print([[round(x / (counter if genderIndex == i else 1), 2)
                            for x in results[i]] for i in range(len(results))])
        results[genderIndex] = [x / counter for x in results[genderIndex]]
    return results
if args.model == "REAL_REAL": WORD2 = WORD elif args.model == "EVEN_ODD": WORDA = WORD[::2] WORDB = WORD[1::2] WORD2 = WORDA + WORDB assert len(WORD2) == len(WORD) elif args.model == "SORTED": # not invertible WORD2 = "".join(sorted(list(WORD))) for x in WORD2: yield x for _ in range(args.cutoff + 2): yield "EOW" corpusDev = CorpusIterator( args.language, "dev", storeMorph=True).iterator(rejectShortSentences=False) words = [] with open("/u/scr/corpora/ldc/1996/LDC96L14/english/epl/epl.cd", "r") as inFile: for line in inFile: line = line.strip().split("\\") orth = line[1] syll = line[5].replace("'", "").split("-") print(orth, syll) #, line) if args.model == "REAL_REAL": WORD2 = "".join(syll) elif args.model == "EVEN_ODD": syllA = syll[::2] syllB = syll[1::2]
parser.add_argument("--hidden_dim", type=int, default=1024) parser.add_argument("--layer_num", type=int, default=1) parser.add_argument("--weight_dropout_in", type=float, default=0.01) parser.add_argument("--weight_dropout_hidden", type=float, default=0.1) parser.add_argument("--char_dropout_prob", type=float, default=0.33) parser.add_argument("--char_noise_prob", type=float, default=0.01) parser.add_argument("--learning_rate", type=float, default=0.1) parser.add_argument("--myID", type=int, default=random.randint(0, 1000000000)) parser.add_argument("--sequence_length", type=int, default=50) args = parser.parse_args() print(args) from corpusIterator import CorpusIterator training = CorpusIterator(args.language, partition="train", storeMorph=False, removePunctuation=True) dev = CorpusIterator(args.language, partition="dev", storeMorph=False, removePunctuation=True) def plus(it1, it2): for x in it1: yield x for x in it2: yield x try:
    def __init__(self, language, partition="train", storeMorph=False, splitLemmas=False):
        self.basis = CorpusIterator(language, partition=partition,
                                    storeMorph=storeMorph, splitLemmas=splitLemmas)
assert args.beta <= 1.0

import random
import sys

header = [
    "index", "word", "lemma", "posUni", "posFine", "morph", "head", "dep",
    "_", "_"
]

from corpusIterator import CorpusIterator

# Count POS n-grams of length 2*horizon over the training set, with EOS
# padding at sentence boundaries.
ngrams = {}
lastPosUni = ("EOS", ) * (2 * args.horizon - 1)
for sentence in CorpusIterator(args.language, "train", storeMorph=True).iterator():
    for line in sentence:
        nextPosUni = line["posUni"]
        ngram = lastPosUni + (nextPosUni, )
        ngrams[ngram] = ngrams.get(ngram, 0) + 1
        lastPosUni = lastPosUni[1:] + (nextPosUni, )
    nextPosUni = "EOS"
    ngram = lastPosUni + (nextPosUni, )
    ngrams[ngram] = ngrams.get(ngram, 0) + 1
    lastPosUni = lastPosUni[1:] + (nextPosUni, )

import torch.nn as nn
import torch
from torch.autograd import Variable
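# Hedged sketch (an assumption, not from this file): if args.beta is an
# additive-smoothing constant, the raw counts collected above would turn into
# conditional probabilities P(next POS | context) roughly like this.
def conditional_prob(ngrams, context, nextPos, beta, vocabSize):
    joint = ngrams.get(context + (nextPos, ), 0) + beta
    total = sum(count for ngram, count in ngrams.items()
                if ngram[:-1] == context) + beta * vocabSize
    return joint / total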
# ./python27 corpusSizes.py > corpusSizes.tsv
# Result extracted from tex file:
# c(1315,974,21864,514,926,5396,788,8907,808,550,13123,3997,7689,102993,4383,18310,17062,1450,6959,1108,27198,32347,13814,1662,5241,13304,910,4477,17427,7164,947,27410,634,4124,1123,848,2257,29870,4798,6100,17995,8664,52664,2935,8483,7532,28492,7041,900,3685,4506,4043,1656,1400)

from corpusIterator import CorpusIterator
from ud_languages import languages

with open("../corpusSizes.tsv", "w") as outFile:
    print >> outFile, ("Language\tTrainingSents\tHeldoutSents")
    for language in languages:
        train = [x for x in CorpusIterator(language, "train", storeMorph=False).iterator()]
        heldout = [x for x in CorpusIterator(language, "dev", storeMorph=False).iterator()]
        print >> outFile, (language + "\t" + str(len(train)) + "\t" + str(len(heldout)))
crossEntropy = 10.0
counter = 0
lastDevLoss = None
failedDevRuns = 0
devLosses = []

corpusTrain = CorpusIterator(args.language, "train", storeMorph=True).iterator(rejectShortSentences=False)
count = 0
for s in corpusTrain:
    count += 1
    if count % 100 == 0:
        print(count)
    orderSentence(s, dhLogits, False)


def toCounts(l):
    c = {}
    for x in l:
        c[x] = c.get(x, 0) + 1.0
    return c
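# Quick illustration of toCounts (not in the original file):
# toCounts(["a", "b", "a"]) == {"a": 2.0, "b": 1.0}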
def __init__(self, language, partition="train", fraction=1.0, storeMorph=False, splitLemmas=False): self.basis = CorpusIterator(language, partition=partition, storeMorph=storeMorph, splitLemmas=splitLemmas, shuffleDataSeed=4) self.basis.data = self.basis.data[:int(fraction*len(self.basis.data))] self.permute() self.fraction = fraction
        counter += 1
        printHere = (counter % 50 == 0)
        current = batch[partition * batchSize:(partition + 1) * batchSize]
        _, _, _, newLoss, newWords, lossWords, lossPOS = doForwardPass(current, train=False)
        devLoss += newLoss
        devWords += newWords
        devLossWords += lossWords
        devLossPOS += lossPOS
    return devLoss / devWords, devLossWords / devWords, devLossPOS / devWords


while True:
    corpus = CorpusIterator(language)
    corpus.permute()
    corpus = corpus.iterator(rejectShortSentences=True)
    while True:
        try:
            batch = map(lambda x: next(corpus), 10 * range(batchSize))
        except StopIteration:
            break
        batch = sorted(batch, key=len)
        partitions = range(10)
        shuffle(partitions)
        for partition in partitions:
            counter += 1
            printHere = (counter % 100 == 0)
            current = batch[partition * batchSize:(partition + 1) * batchSize]
logsoftmax = torch.nn.LogSoftmax()


def deepCopy(sentence):
    result = []
    for w in sentence:
        entry = {}
        for key, value in w.iteritems():
            entry[key] = value
        result.append(entry)
    return result


# Standard-normal priors over the direction and distance weights, one
# dimension per dependency type.
dhWeights_Prior = Normal(Variable(torch.FloatTensor([0.0] * len(itos_deps))),
                         Variable(torch.FloatTensor([1.0] * len(itos_deps))))
distanceWeights_Prior = Normal(Variable(torch.FloatTensor([0.0] * len(itos_deps))),
                               Variable(torch.FloatTensor([1.0] * len(itos_deps))))

counter = 0

corpus = CorpusIterator(language, "train")


def guide(corpus):
    # Variational posterior: independent Normals with learnable means and scales.
    mu_DH = pyro.param("mu_DH", Variable(torch.FloatTensor([0.0] * len(itos_deps)), requires_grad=True))
    mu_Dist = pyro.param("mu_Dist", Variable(torch.FloatTensor([0.0] * len(itos_deps)), requires_grad=True))
    sigma_DH = pyro.param("sigma_DH", Variable(torch.FloatTensor([1.0] * len(itos_deps)), requires_grad=True))
    sigma_Dist = pyro.param("sigma_Dist", Variable(torch.FloatTensor([1.0] * len(itos_deps)), requires_grad=True))
    dhWeights = pyro.sample("dhWeights", dist.Normal(mu_DH, sigma_DH))
    distanceWeights = pyro.sample("distanceWeights", dist.Normal(mu_Dist, sigma_Dist))


def model(corpus):
    global counter
    dhWeights = pyro.sample("dhWeights", dhWeights_Prior)
    distanceWeights = pyro.sample("distanceWeights", distanceWeights_Prior)
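# Hedged sketch (not from this file): with a recent Pyro, a model/guide pair
# like the one above would typically be optimized by stochastic variational
# inference. The optimizer settings and step count are placeholder assumptions.
import pyro
from pyro.infer import SVI, Trace_ELBO
from pyro.optim import Adam

svi = SVI(model, guide, Adam({"lr": 0.01}), loss=Trace_ELBO())
for step in range(1000):
    svi.step(corpus)  # one ELBO gradient step on the corpus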
        partitions = range(10)
        shuffle(partitions)
        for partition in partitions:
            counter += 1
            printHere = (counter % 5 == 0)
            current = batch[partition * batchSize:(partition + 1) * batchSize]
            _, _, _, newLoss, newWords = doForwardPass(current)
            devLoss += newLoss
            devWords += newWords
    return devLoss / devWords


while True:
    corpus = CorpusIterator(args.language).iterator(rejectShortSentences=True)
    while True:
        try:
            batch = map(lambda x: next(corpus), 10 * range(batchSize))
        except StopIteration:
            break
        batch = sorted(batch, key=len)
        partitions = range(10)
        shuffle(partitions)
        for partition in partitions:
            counter += 1
            printHere = (counter % 20 == 0)
            current = batch[partition * batchSize:(partition + 1) * batchSize]
            loss, baselineLoss, policy_related_loss, _, wordNumInPass = doForwardPass(
for line in inFile:
    line = line.strip().split(" ")
    if len(line) != 3:
        continue
    if line[1] != "ADJA":
        if wentThroughAdjectives:
            continue
        else:
            wentThroughAdjectives = True
    if int(line[0]) > 100 and not line[2].endswith("r"):
        adjectives.append(line[2])
print(len(adjectives))

from corpusIterator import CorpusIterator

data = CorpusIterator("German", partition="train", removePunctuation=False).iterator()
frames = []
for sentence in data:
    mits = []
    for word in sentence:
        # Collect adpositional "mit" case markers whose head is a noun.
        if word["lemma"] == "mit" and word["posUni"] == "ADP" and word["dep"] == "case":
            head = word["head"] - 1
            if head < 0:
                continue
            if sentence[head]["posUni"] not in ["NOUN", "PROPN"]:
                continue
            mits.append(word)
    if len(mits) > 0:
        mit = random.choice(mits)
    else:
        assert totalDepLength == 0
    numberOfWords = wordNum
    return (totalDepLength, numberOfWords, byType)


assert batchSize == 1

depLengths = []
outpath = "/u/scr/mhahn/japanese/" + str(myID)
with open(outpath, "w") as outFile:
    print >> outFile, "\t".join(["Sent", "Length"])
    counter = 0
    if True:
        corpus = CorpusIterator(language, "train", shuffleDataSeed=40)
        corpusIterator = corpus.iterator()
        if corpus.length() == 0:
            quit()
        while True:
            try:
                batch = map(lambda x: next(corpusIterator), 10 * range(batchSize))
            except StopIteration:
                break
            batch = sorted(batch, key=len)
            partitions = range(10)
            for partition in partitions:
                counter += 1
                printHere = (counter % 100 == 0)
            if targetWord >= vocab_size:
                # Out of vocabulary: back off to the POS tag index.
                input_indices.append(stoi_pos_uni[line["posUni"]] + 3)
            else:
                input_indices.append(targetWord + 3 + len(itos_pos_uni))
        yield input_indices, wordStartIndices + [len(input_indices)], relevant_logprob_sum
        input_indices = [2]  # Start of Segment (makes sure that the first word can be predicted from this token)
        wordStartIndices = []


DEV_PERIOD = 5000
epochCount = 0
corpusBase = CorpusIterator(args.language, storeMorph=True)
while failedDevRuns < args.stopAfterFailures:
    epochCount += 1
    print >> sys.stderr, "Epoch " + str(epochCount)
    print "Starting new epoch, permuting corpus"
    corpusBase.permute()
    corpus = corpusBase.iterator(rejectShortSentences=False)
    stream = createStream(corpus)
    if counter > 5:
        newDevLoss, _ = computeDevLoss()
        devLosses.append(newDevLoss)
    global printHere
    global devLosses
    global baselineAverageLoss
    batchOrderedLogits = zip(
        *map(lambda (y, x): orderSentence(x, dhLogits, y == 0 and printHere),
             zip(range(len(current)), current)))
    batchOrdered = batchOrderedLogits[0]
    logits = batchOrderedLogits[1]
    logitCorr = batchOrdered[0][-1]["relevant_logprob_sum"]
    print(logitCorr)
    return float(logitCorr)


corpusDev = CorpusIterator(language, "dev").iterator(rejectShortSentences=True)

totalLikelihood = 0
numberOfSentences = 0

while True:
    try:
        batch = map(lambda x: next(corpusDev), 1 * range(batchSize))
    except StopIteration:
        break
    batch = sorted(batch, key=len)
    partitions = range(1)
    shuffle(partitions)
    for partition in partitions:
        counter += 1
        printHere = (counter % 50 == 0)
args = parser.parse_args()
print(args)

assert args.language == "german"

import corpusIteratorWiki
from corpusIterator import CorpusIterator

if True:
    training = CorpusIterator("German", partition="train", storeMorph=True, removePunctuation=True)
    vocabulary = {"NOUN": set(), "VERB": set()}
    for sentence in training.iterator():
        for line in sentence:
            if line["posUni"] in vocabulary:
                vocabulary[line["posUni"]].add(line["word"].lower())

# Discard forms attested as both noun and verb. (Taking the intersection
# first makes both sets shrink; subtracting sequentially would leave the
# ambiguous forms in whichever set is updated second.)
ambiguous = vocabulary["NOUN"].intersection(vocabulary["VERB"])
vocabulary["NOUN"] = vocabulary["NOUN"].difference(ambiguous)
vocabulary["VERB"] = vocabulary["VERB"].difference(ambiguous)
print "Dev examples "+str(devCounter) devCounterTimesBatchSize += devBatchSize devSurprisalTableHere = [surp/(devCounterTimesBatchSize) for surp in surprisalTable] return devLoss/devWords, devSurprisalTableHere depLengths = 0 depsNum = 0 for EPOCH in range(30): corpusDev = CorpusIterator(language,"train", storeMorph=True).iterator(rejectShortSentences = False) SENT = 0 for sentence in corpusDev: ordereds = orderSentence(sentence, dhLogits, False) for ordered in ordereds: SENT += 1 #print(list(ordered)) loss = doForwardPass(sentence, ordered) doBackwardPass(loss) # ['relativeClause', 'betweenAndHead', 'after', 'before'] quit() DEV_PERIOD = 5000
myID = random.randint(0, 10000000)

deps = ["acl", "acl:relcl", "advcl", "advmod", "amod", "appos", "aux", "auxpass",
        "case", "cc", "ccomp", "compound", "compound:prt", "conj", "conj:preconj",
        "cop", "csubj", "csubjpass", "dep", "det", "det:predet", "discourse",
        "dobj", "expl", "foreign", "goeswith", "iobj", "list", "mark", "mwe",
        "neg", "nmod", "nmod:npmod", "nmod:poss", "nmod:tmod", "nsubj",
        "nsubjpass", "nummod", "parataxis", "punct", "remnant", "reparandum",
        "root", "vocative", "xcomp"]

from math import log, exp
from random import shuffle
from corpusIterator import CorpusIterator

corpus = CorpusIterator(language)
devSet = CorpusIterator(language, "dev")

leftCounts = {}
rightCounts = {}
conditionalCountsLR = {}
conditionalCountsRL = {}


def register(x, y, counts):
    # Nested count table: counts[x][y], plus a running total under '_TOTAL_'.
    if x not in counts:
        counts[x] = {'_TOTAL_': 0}
    counts[x][y] = counts[x].get(y, 0.0) + 1.0
    counts[x]['_TOTAL_'] = counts[x]['_TOTAL_'] + 1.0
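# Illustrative helper (an assumption, not in the original file): reading a
# conditional relative frequency back out of the nested tables that
# register() builds, e.g. P(y | x) under leftCounts.
def conditionalProb(counts, x, y):
    if x not in counts:
        return 0.0
    return counts[x].get(y, 0.0) / counts[x]['_TOTAL_']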
    newDevLossL = devLossL / devWords
    newDevLossU = devLossU / devWords
    newDevAccuracy = float(devAccuracy) / devWords
    newDevAccuracyLabeled = float(devAccuracyLabeled) / devWords
    devLossesL.append(newDevLossL)
    devLossesU.append(newDevLossU)
    devAccuracies.append(newDevAccuracy)
    devAccuraciesLabeled.append(newDevAccuracyLabeled)


counter = 0
epochs = 0
while True:
    corpus = CorpusIterator(language, "train").iterator(rejectShortSentences=True)
    partitions = getPartitions(corpus)
    epochs += 1
    for partition in partitions:
        if counter > maxNumberOfUpdates:
            print "Ran for a long time, quitting."
            quit()
        counter += 1
        printHere = (counter % 100 == 0)
        _, loss, policyLoss, _, _, wordNum = forward(partition)
        if wordNum == 0:
            assert loss == 0  # was `loss is 0`; identity comparison with numbers is unreliable
        else:
            backward(loss, policyLoss)
        if printHere: