def oracle_trace(self, document):
    assert len(document.gold) > 0, "No gold actions"
    state = ParserState(document, self)
    for gold in document.gold:
        print("Taking gold action", gold)
        print("On state:", state)
        gold_index = self.actions.indices.get(gold, None)
        assert gold_index is not None, "Unknown gold action: %r" % gold
        assert state.is_allowed(gold_index), "Disallowed gold action: %r" % gold
        state.advance(gold)
    print("Final state after", len(document.gold), "actions:", state)
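
# Usage sketch (assumed names: `parser` is the object defining oracle_trace,
# and `corpus.documents` yields documents carrying gold actions; neither is
# shown in this file). Replaying the oracle over a corpus is a cheap
# pre-training sanity check that every gold sequence is known and legal.
def check_gold_sequences(parser, corpus):
    for doc in corpus.documents:
        parser.oracle_trace(doc)  # raises AssertionError on a bad sequence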
def __main__():
    trainingCorpus = ParsedConllFile(keepMalformed=False, projectivize=True)
    trainingCorpus.read(
        open(trainingFile, 'r', encoding='utf-8').read())

    # make fake model params, enough for lexicon builder
    # we still need feature_maps to use ParserState
    modelParams = ModelParameters('')
    modelParams.trainingFile = trainingFile
    modelParams.cfg = {'projectivizeTrainingSet': True}

    lexicon = Lexicon(modelParams)
    lexicon.compute()

    sentence = trainingCorpus.sentences[0]
    parser_state = ParserState(sentence, lexicon.getFeatureMaps())

    # necessary for initializing and pushing root
    # (only initialize transition_state_class once!)
    # keep arc_state in sync with parser_state
    arc_state = transition_state_class(parser_state)

    dynamicOracleTrainTest(parser_state)
def test_parse():
    """Simple tests for the PartialParse.parse function.

    Warning: these are not exhaustive.
    """
    sentence = [
        Token(i + 1, f) for i, f in enumerate(["parse", "this", "sentence"])
    ]
    state = ParserState(stack=[ROOT], buffer=sentence)
    dependencies = state.parse(["S", "S", "S", "LA", "RA", "RA"])
    dependencies = [(a[0].form, a[1].form) for a in sorted(dependencies)]
    expected = [('ROOT', 'parse'), ('parse', 'sentence'), ('sentence', 'this')]
    assert dependencies == expected, \
        f"parse test resulted in dependencies {dependencies}, expected {expected}"
    assert [t.form for t in sentence] == ["parse", "this", "sentence"], \
        "parse test failed: the input sentence should not be modified"
    print("parse test passed!")
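
# A minimal sketch (illustrative helper, not the project's ParserState) of
# the arc-standard semantics that test_parse exercises: each arc is a
# (head, dependent) pair, and the expected result above follows from
# replaying ["S", "S", "S", "LA", "RA", "RA"] by hand.
def apply_transition(stack, buffer, arcs, transition):
    if transition == "S":     # SHIFT: move the next buffer token to the stack
        stack.append(buffer.pop(0))
    elif transition == "LA":  # LEFT-ARC: stack[-2] becomes a dependent of stack[-1]
        dependent = stack.pop(-2)
        arcs.append((stack[-1], dependent))
    elif transition == "RA":  # RIGHT-ARC: stack[-1] becomes a dependent of stack[-2]
        dependent = stack.pop()
        arcs.append((stack[-1], dependent))
    return stack, buffer, arcs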
def parse(self, sentences, model, conllu=False):
    """
    @param sentences: a list of (list Token).
    @param model: a trained parser model.
    @param conllu: if True prints the parsed sentences in CoNLL-U format.
    """
    vsentences = self.vectorize(sentences)
    UAS = LAS = all_tokens = 0.0
    for sent, vsent in zip(sentences, vsentences):
        if not conllu:
            print('.', end='')  # show progress
        state = ParserState([self.root_token], vsent, [])  # FIXME
        while state.buffer or len(state.stack) > 1:
            feats = state.extract_features(self)
            trans = model.predict([feats])[0].argmax()
            if not state.step(trans):
                break  # if transition is not feasible
        if conllu:
            for j, t in enumerate(sent):
                head = deprel = 0
                for arc in state.arcs:
                    if arc[1].id == t.id:
                        head = arc[0].id
                        deprel = arc[2]
                        break
                print('\t'.join([
                    str(j + 1), t.form, '_', t.pos, '_', '_',
                    str(head), self.id2dep[deprel], '_', '_'
                ]))
            print()
        for arc in state.arcs:
            pred_h = arc[0].id
            gold_h = arc[1].head
            UAS += pred_h == gold_h
            pred_l = arc[2]
            gold_l = arc[1].deprel
            LAS += pred_h == gold_h and pred_l == gold_l
            all_tokens += 1
    UAS /= all_tokens
    LAS /= all_tokens
    return UAS, LAS
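
# Worked example of the UAS/LAS arithmetic above (toy numbers, purely
# illustrative): with 3 scored tokens, if 2 predicted heads match the gold
# heads and only 1 of those also carries the gold label, then
#
#   UAS = 2 / 3 ~= 0.667   (unlabeled: head match only)
#   LAS = 1 / 3 ~= 0.333   (labeled: head and deprel must both match)
#
# LAS can never exceed UAS, because a labeled match requires a head match.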
def advanceSentence(self, i):
    self.logger.debug('Slot(%d): advance sentence' % i)
    assert 0 <= i < self.batch_size
    if self.sentence_batch.advanceSentence(i):
        self.parser_states[i] = ParserState(
            self.sentence_batch.sentence(i), self.feature_maps)
        # necessary for initializing and pushing root
        # keep arc_states in sync with parser_states
        self.arc_states[i] = \
            self.transition_state_class(self.parser_states[i])
    else:
        self.parser_states[i] = None
        self.arc_states[i] = None
def advanceSentence(self, i):
    assert 0 <= i < self.batch_size
    if self.sentence_batch.advanceSentence(i):
        self.parser_states[i] = ParserState(
            self.sentence_batch.sentence(i), self.feature_maps)
        # necessary for initializing and pushing root
        # keep arc_states in sync with parser_states
        self.arc_states[i] = \
            self.transition_state_class(self.parser_states[i])
    else:
        self.parser_states[i] = None
        self.arc_states[i] = None
    if self.state(i) is not None:
        self.docids_.insert(0, self.state(i).sentence().docid())
def create_features(self, sentences):
    """
    Build training instances.
    @return: list(features), list(action) for each state while parsing
        each sentence
    """
    train_x, train_y = [], []
    with tqdm(total=len(sentences)) as prog:
        for sent in sentences:
            # arcs = [(head, dependent, deprel)]
            state = ParserState([self.root_token], sent, [])  # FIXME
            while state.buffer or len(state.stack) > 1:
                gold_t = state.get_oracle()
                if gold_t is None:
                    break
                train_x.append(state.extract_features(self))
                train_y.append(gold_t)
                state.step(gold_t)  # perform transition
            prog.update(1)
    return train_x, train_y
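
# Usage sketch (assumptions: TensorFlow/Keras is available, and
# extract_features returns fixed-length numeric vectors -- neither is shown
# in this file). The oracle pairs from create_features() train a classifier
# whose predict() output is a distribution over transitions, which is what
# the model.predict([feats])[0].argmax() call in parse() expects.
import numpy as np
from tensorflow import keras

def train_transition_classifier(parser, sentences, n_transitions):
    train_x, train_y = parser.create_features(sentences)
    train_x = np.array(train_x)
    model = keras.Sequential([
        keras.layers.Dense(200, activation='relu',
                           input_shape=(train_x.shape[1],)),
        keras.layers.Dense(n_transitions, activation='softmax'),
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
    model.fit(train_x, np.array(train_y), epochs=3)
    return model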
def forward(self, document, train=False, debug=False):
    # Compute LSTM outputs for all tokens.
    lr_out, rl_out, lstm_features = self._lstm_outputs(document)

    # Run FF unit.
    state = ParserState(document, self.spec)
    actions = self.spec.actions
    cascade = self.spec.cascade
    ff_activations = []

    if train:
        losses = Losses()

        # Translate the gold actions into their cascade equivalents.
        cascade_gold = cascade.translate(document.gold)
        gold_index = 0
        while not state.done:
            # Compute the hidden layer once for all cascade delegates.
            ff_activation, _ = self._ff_activation(
                lr_out, rl_out, ff_activations, state)
            cascading = True
            delegate_index = 0  # assume we start the cascade at delegate 0
            while cascading:
                # Get the gold action for the delegate and compute loss w.r.t. it.
                gold = cascade_gold[gold_index]
                step_loss = cascade.loss(delegate_index, state, ff_activation, gold)
                losses.add(delegate_index, step_loss)

                # If the gold action was a CASCADE, move to the next delegate.
                if gold.is_cascade():
                    delegate_index = gold.delegate
                else:
                    state.advance(gold)
                    cascading = False
                gold_index += 1

        return losses
    else:
        if document.size() == 0:
            return state

        shift = actions.action(actions.shift())
        stop = actions.action(actions.stop())
        disallowed_counts = [0] * cascade.size()
        total_counts = [0] * cascade.size()
        trace = Trace(self.spec, state, lstm_features) if debug else None
        while not state.done:
            # Compute the FF activation once for all cascade delegates.
            ff_activation, ff_features = self._ff_activation(
                lr_out, rl_out, ff_activations, state, debug=debug)
            if trace:
                trace.start_step(state, ff_features)

            # Store the last CASCADE action in a cascade.
            delegate_index = 0
            last = None
            while True:
                # Get the highest scoring action from the cascade delegate.
                # Note: We don't have to do any filtering or checking here, we
                # can just return the top-scoring action.
                best = cascade.predict(delegate_index, state, last, ff_activation)
                final = best
                if best.is_cascade():
                    delegate_index = best.delegate
                    last = best
                else:
                    # If the action isn't allowed or can't be applied to the
                    # state, then default to SHIFT or STOP.
                    index = actions.index(best)
                    total_counts[delegate_index] += 1
                    if actions.disallowed[index] or not state.is_allowed(index):
                        disallowed_counts[delegate_index] += 1
                        final = shift
                        if state.current == state.end:
                            final = stop
                if trace:
                    trace.action(best, final)

                if not final.is_cascade():
                    # Apply the action and stop the cascade.
                    state.advance(final)
                    break

        return state, disallowed_counts, total_counts, trace
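
# Usage sketch (assumed driver code, not from the source): the two modes
# return different things -- training yields a Losses accumulator over
# cascade delegates, inference yields the final state plus per-delegate
# statistics on how often a disallowed prediction fell back to SHIFT/STOP.
#
#   losses = model.forward(document, train=True)
#   state, disallowed, total, trace = model.forward(document, debug=True)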
def setupParser(self, mode):
    hiddenLayerSizes = self.modelParams.cfg['hiddenLayerSizes']
    featureStrings = self.modelParams.cfg['featureStrings']
    embeddingSizes = self.modelParams.cfg['embeddingSizes']
    batchSize = self.modelParams.cfg['batchSize']
    transitionSystem = self.modelParams.cfg['transitionSystem']

    if transitionSystem == 'arc-standard':
        self.transitionSystem = ArcStandardTransitionSystem()
    elif transitionSystem == 'arc-eager':
        self.transitionSystem = ArcEagerTransitionSystem()
    else:
        assert False, 'transition system must be arc-standard or arc-eager'

    assert len(hiddenLayerSizes) > 0, 'must have at least one hidden layer'
    assert len(featureStrings) == len(set(featureStrings)), \
        'duplicate feature string detected'

    if mode == 'train':
        # determine if we have to compute or read the lexicon
        self.logger.info('Computing lexicon from training corpus...')
        self.modelParams.lexicon.compute()
        self.logger.info('Done building lexicon')
        self.modelParams.lexicon.write()
    elif mode == 'evaluate':
        self.logger.info('Reading lexicon from trained model...')
        self.modelParams.lexicon.read()
    else:
        assert False, 'invalid mode: ' + mode

    self.featureMaps = self.modelParams.lexicon.getFeatureMaps()

    self.logger.info('Feature strings: ' + str(featureStrings))

    # Get major type groups in sorted order by constructing a null parser
    # state, extracting features, and then concatenating the similar types.
    fvec = SparseFeatureExtractor(featureStrings, self.featureMaps) \
        .extract(ParserState(ParsedConllSentence(docid=None),
                             self.featureMaps),
                 doLogging=False)

    featureTypeInstances = fvec.types
    self.featureMajorTypeGroups, _ = fvec.concatenateSimilarTypes()

    # index: major feature type index
    # values: feature names under that type
    self.featureNames = [[] for t in self.featureMajorTypeGroups]

    self.logger.info('Detected major feature groups (in alphabetical '
                     'order): ' + str(self.featureMajorTypeGroups))

    self.featureDomainSizes = []
    #self.featureEmbeddings = []

    # For now, use all same embedding sizes
    self.featureEmbeddingSizes = \
        [embeddingSizes[t] for t in self.featureMajorTypeGroups]

    self.BAG_OF_FEATURES_LEN = 0

    for i in range(len(featureTypeInstances)):
        major_type = featureTypeInstances[i].major_type
        major_type_index = self.featureMajorTypeGroups.index(major_type)
        self.featureNames[major_type_index].append(
            featureTypeInstances[i].name)
        self.BAG_OF_FEATURES_LEN += \
            self.featureEmbeddingSizes[major_type_index]

    for i in range(len(self.featureMajorTypeGroups)):
        major_type = self.featureMajorTypeGroups[i]

        self.logger.info('')
        self.logger.info('Feature group \'%s\'' % major_type)
        self.logger.info('... domain size: %d' %
            self.featureMaps[major_type].getDomainSize(includeSpecial=True))
        self.logger.info('... embedding size: %d' %
            self.featureEmbeddingSizes[i])
        #self.logger.info('... feature count: %d' % len(self.featureNames[i]))
        self.logger.info('... features')
        for fname in self.featureNames[i]:
            self.logger.info('....... %s' % fname)
        self.logger.info('... total group embedding size: %d' %
            (len(self.featureNames[i]) * self.featureEmbeddingSizes[i]))
        self.logger.info('... initializing random normal embeddings...')

        self.featureDomainSizes.append(
            self.featureMaps[major_type].getDomainSize(includeSpecial=True))

    assert len(self.featureDomainSizes) == len(self.featureEmbeddingSizes)
    #assert len(self.featureDomainSizes) == len(self.featureEmbeddings)
    assert len(self.featureDomainSizes) == len(self.featureNames)

    self.logger.info('')
    self.logger.info('Batch size (number of parser states): %d' % batchSize)
    self.logger.info('Total feature count: %d' % len(featureTypeInstances))
    self.logger.info('Total bag of features length per state: %d' %
                     self.BAG_OF_FEATURES_LEN)
    self.logger.info('Total features input size: %d' %
                     (batchSize * self.BAG_OF_FEATURES_LEN))

    # for actions, we don't encode UNKNOWN, ROOT, or OUTSIDE;
    # we only encode the number of base values
    self.ACTION_COUNT = self.transitionSystem.numActions(
        self.featureMaps['label'].getDomainSize(includeSpecial=False))
    self.logger.info('Total action count: %d' % self.ACTION_COUNT)
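
# Worked example of the ACTION_COUNT arithmetic (the exact inventory is an
# assumption about numActions(); a typical arc-standard system has SHIFT
# plus one LEFT-ARC and one RIGHT-ARC per dependency label):
#
#   n_labels = 40                      # base label values, no UNKNOWN/ROOT/OUTSIDE
#   arc_standard = 1 + 2 * n_labels    # = 81 actions
#   arc_eager    = 2 + 2 * n_labels    # = 82, with the extra REDUCE action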