def __init__(self,
             language,
             partition="train",
             storeMorph=False,
             splitLemmas=False,
             shuffleDataSeed=None):
    """Create the wrapped ``CorpusIterator_V`` and store it as ``self.basis``.

    Every argument is forwarded unchanged to ``CorpusIterator_V``:
    ``language`` positionally, the remaining ones by keyword.
    """
    basisOptions = {
        "partition": partition,
        "storeMorph": storeMorph,
        "splitLemmas": splitLemmas,
        "shuffleDataSeed": shuffleDataSeed,
    }
    self.basis = CorpusIterator_V(language, **basisOptions)
def __init__(self,
             language,
             partition="train",
             fraction=1.0,
             storeMorph=False,
             splitLemmas=False):
    """Create the wrapped ``CorpusIterator_V`` limited to a leading fraction.

    The underlying iterator is always built with ``shuffleDataSeed=4``. Its
    ``data`` is then cut down to the first ``int(fraction * len(data))``
    entries, ``self.permute()`` is invoked, and ``fraction`` is recorded on
    the instance.
    """
    corpus = CorpusIterator_V(
        language,
        partition=partition,
        storeMorph=storeMorph,
        splitLemmas=splitLemmas,
        shuffleDataSeed=4,
    )
    self.basis = corpus

    # Keep only the leading share of the (seed-4 shuffled) data.
    keepCount = int(fraction * len(corpus.data))
    corpus.data = corpus.data[:keepCount]

    # permute() runs before self.fraction is assigned, as in the original.
    self.permute()
    self.fraction = fraction
def _allocateChart(batchSize):
    """Return a fresh MAX_BOUNDARY x MAX_BOUNDARY grid of -inf chart cells.

    Every cell is a float32 CUDA tensor with one row per batch element and
    one column per entry of ``itos_setOfNonterminals``.
    """
    cellShape = (batchSize, len(itos_setOfNonterminals))
    return [[
        torch.full(cellShape, float("-Inf"), dtype=torch.float32, device="cuda")
        for _ in range(args.MAX_BOUNDARY)
    ] for _ in range(args.MAX_BOUNDARY)]


def _boundaryDeltas(values):
    """Return ``values[i + 1] - values[i]`` for each pair of adjacent boundaries."""
    return [values[i + 1] - values[i] for i in range(args.MAX_BOUNDARY - 1)]


def runOnCorpus():
    """Run ``computeSurprisals`` over the "dev" partition, batch by batch.

    Sentences are pulled from ``iterator_dense`` in groups of
    ``args.BATCHSIZE``. Around every batch the differences between the
    per-boundary averages of adjacent boundaries are printed next to the
    global ``sentCount``; the averages themselves are recomputed after each
    batch as ``surprisalTableSums[i] / (surprisalTableCounts[i] + 1e-9)``.

    Side effects:
        * Rebinds the global ``chart``.
        * When the last batch is short, ``args.BATCHSIZE`` is lowered to its
          size (and not restored here) and ``chart`` is reallocated to match.

    Returns:
        list: one average per boundary index ``0 .. args.MAX_BOUNDARY - 1``
        as of the last processed batch (all zeros if no batch was processed).
    """
    global chart
    chart = _allocateChart(args.BATCHSIZE)
    iterator = iterator_dense(
        CorpusIterator_V(args.language, "dev", shuffleDataSeed=4).iterator())
    surprisals = [0] * args.MAX_BOUNDARY
    while True:
        linearized = []
        try:
            for _ in range(args.BATCHSIZE):
                linearized.append(next(iterator))
        except StopIteration:
            if not linearized:
                # Corpus exhausted exactly on a batch boundary: nothing left.
                break
            # Short final batch: shrink the batch size and the chart so both
            # agree with the number of sentences actually collected.
            args.BATCHSIZE = len(linearized)
            chart = _allocateChart(args.BATCHSIZE)

        # Progress line before the batch, based on the previous averages.
        print(sentCount, _boundaryDeltas(surprisals))
        # NOTE(review): computeSurprisals presumably updates
        # surprisalTableSums / surprisalTableCounts — confirm at its definition.
        computeSurprisals(linearized)
        # The 1e-9 guards against division by zero for boundaries with no counts.
        surprisals = [
            surprisalTableSums[i] / (surprisalTableCounts[i] + 1e-9)
            for i in range(args.MAX_BOUNDARY)
        ]
        print(sentCount, _boundaryDeltas(surprisals))
    return surprisals
# NOTE(review): the two statements below close a function whose header lies above this point.
    assert tree["category"] in leftCornerCounts
    return leftCorner


def linearizeTree2String(tree, sent):
    """Append the words of ``tree`` to ``sent``, visiting children in order.

    A node whose ``"children"`` entry is ``None`` is treated as a leaf and
    contributes its ``"word"``; any other node is expanded by recursing into
    each child in sequence. ``sent`` is mutated in place; nothing is returned.
    """
    if tree["children"] is None:
        sent.append(tree["word"])
    else:
        for x in tree["children"]:
            linearizeTree2String(x, sent)


# Running count of training sentences processed so far.
sentCount = 0
print("Collecting counts from training corpus")
for sentence in CorpusIterator_V(args.language,"train", ignoreCorporaWithoutWords=True).iterator():
    sentCount += 1
    # The second argument is True on every 400th sentence
    # (presumably a debug-print flag — TODO confirm against orderSentence).
    ordered = orderSentence(sentence, sentCount % 400 == 0)
    linearized = []
    linearizeTree2String(ordered, linearized)
    # if len(linearized) > 10:
    #     continue
    # print(ordered)
    # Tally the category of the top node of the ordered tree.
    roots[ordered["category"]] = roots.get(ordered["category"], 0) + 1
    rootsTotal = rootsTotal + 1
    # Progress report every 100 sentences.
    if sentCount % 100 == 0:
        print(sentCount, ordered["category"])