def initializeOrderTable():
    """Collect vocabulary, dependency labels and head-direction statistics.

    Iterates over every sentence of the "train" and "dev" partitions of
    ``args.language`` (via ``CorpusIterator``) and, for each token:

    * counts its word form in ``vocab``;
    * copies ``line["dep"]`` into ``line["fine_dep"]`` (the token dict is
      mutated in place);
    * records ``makeCoarse(line["fine_dep"])`` in ``depsVocab``;
    * adds its tags to the module-level ``posFine`` and ``posUni`` sets.

    For every token whose relation is not ``"root"``, the triple
    ``(posUni of head, fine_dep, posUni of token)`` is registered as a key
    and the arc direction is counted for that key: ``"HD"`` when the head's
    index is smaller than the token's index, ``"DH"`` otherwise.

    Returns:
        A 4-tuple ``(dhLogits, vocab, keys, depsVocab)``:

        * ``dhLogits`` -- dict mapping each key to
          ``log(count_DH + 1) - log(count_HD + 1)``;
        * ``vocab`` -- dict mapping word form to its occurrence count;
        * ``keys`` -- set of all observed key triples;
        * ``depsVocab`` -- set of coarse dependency labels.
    """
    orderTable = {}
    keys = set()
    vocab = {}
    depsVocab = set()
    for partition in ["train", "dev"]:
        for sentence in CorpusIterator(args.language, partition).iterator():
            for line in sentence:
                vocab[line["word"]] = vocab.get(line["word"], 0) + 1
                line["fine_dep"] = line["dep"]
                depsVocab.add(makeCoarse(line["fine_dep"]))
                posFine.add(line["posFine"])
                posUni.add(line["posUni"])
                if line["fine_dep"] == "root":
                    continue
                posHere = line["posUni"]
                # "head" is used as a 1-based position into the sentence
                # (presumably CoNLL-U style -- verify against CorpusIterator).
                posHead = sentence[line["head"] - 1]["posUni"]
                dep = line["fine_dep"]
                direction = "HD" if line["head"] < line["index"] else "DH"
                key = (posHead, dep, posHere)
                # Counts must be stored under (key, direction) because that
                # is what the logit computation below looks up. Storing them
                # under (dep, direction) made every lookup miss, so every
                # logit came out as exactly 0.0.
                keyWithDir = (key, direction)
                orderTable[keyWithDir] = orderTable.get(keyWithDir, 0) + 1
                keys.add(key)

    # Add-one smoothed log-odds of dependent-before-head vs. head-before-dependent.
    dhLogits = {}
    for key in keys:
        hd = orderTable.get((key, "HD"), 0) + 1.0
        dh = orderTable.get((key, "DH"), 0) + 1.0
        dhLogits[key] = log(dh) - log(hd)
    return dhLogits, vocab, keys, depsVocab
# One trainable direction weight and one trainable distance weight per entry
# of itos_pure_deps, all starting at zero.
dhWeights = Variable(
    torch.FloatTensor([0.0] * len(itos_pure_deps)),
    requires_grad=True,
)
distanceWeights = Variable(
    torch.FloatTensor([0.0] * len(itos_pure_deps)),
    requires_grad=True,
)

for i, key in enumerate(itos_pure_deps):
    if key == "obj":
        # Only "obj" starts with a non-zero logit; its sign is decided by a
        # single random() draw.
        dhLogits[key] = 10.0 if random() > 0.5 else -10.0
    else:
        dhLogits[key] = 0.0
    dhWeights.data[i] = dhLogits[key]

    # Distance weights start at zero (random initialisation is disabled).
    originalDistanceWeights[key] = 0.0
    distanceWeights.data[i] = originalDistanceWeights[key]

# Materialise both partitions up front, keeping short sentences and
# morphological information.
data_train = list(
    CorpusIterator(args.language, "train", storeMorph=True).iterator(
        rejectShortSentences=False
    )
)
data_dev = list(
    CorpusIterator(args.language, "dev", storeMorph=True).iterator(
        rejectShortSentences=False
    )
)

words = []
affixFrequency = {}

print(itos_pure_deps)

# Extend the label inventory with the pseudo-label "HEAD" and rebuild the
# label -> index map over the re-sorted list.
# NOTE(review): dhWeights/distanceWeights above were indexed by position in
# the list *before* "HEAD" was added -- confirm that later code does not
# index them through stoi_pure_deps.
itos_pure_deps = list(itos_pure_deps) + ["HEAD"]
itos_pure_deps.sort()
stoi_pure_deps = {label: position for position, label in enumerate(itos_pure_deps)}
itos_pure_deps_ = list(itos_pure_deps)
def __init__(self, language, partition="train", storeMorph=False, splitLemmas=False, shuffleData=True):
    """Create the underlying ``CorpusIterator`` and keep it as ``self.basis``.

    All arguments are forwarded unchanged: ``language`` positionally, the
    rest by keyword. Their meaning is defined by ``CorpusIterator`` and is
    not visible here.
    """
    iteratorOptions = {
        "partition": partition,
        "storeMorph": storeMorph,
        "splitLemmas": splitLemmas,
        "shuffleData": shuffleData,
    }
    self.basis = CorpusIterator(language, **iteratorOptions)