def test_learning_from_denotation(self):
    arithmetic_grammar = Grammar(self.arithmetic_rules)
    arithmetic_examples = self.two_parse_examples + self.one_parse_examples
    from executor import Executor
    arithmetic_model = Model(
        grammar=arithmetic_grammar,
        feature_fn=Parse.operator_precedence_features,
        weights=defaultdict(float),  # Initialize with all weights at zero
        executor=Executor.execute)
    # Train based on correct/incorrect denotation
    from metrics import DenotationAccuracyMetric
    b_trn, b_tst, a_trn, a_tst = arithmetic_model.train_test(
        train_examples=arithmetic_examples[:13],
        test_examples=arithmetic_examples[13:],
        training_metric=DenotationAccuracyMetric(),
        seed=1)
    # BEFORE SGD
    self.assertEqual(b_trn['semantics accuracy'], 10)
    self.assertEqual(b_tst['denotation accuracy'], 4)
    # AFTER SGD
    self.assertEqual(a_trn['semantics accuracy'], 12)   # Improvement
    self.assertEqual(a_trn['denotation accuracy'], 13)  # Improvement
def test_learning_from_many_denotations(self):
    """
    A large number of examples is used for training.
    The last 4 arithmetic_examples are used for testing.
    b_trn: performance metrics on training set before training
    a_trn: performance metrics on training set after training
    denotation accuracy: # of examples where the denotation of the parse at
        position 0 was correct
    """
    arithmetic_grammar = Grammar(self.arithmetic_rules)
    arithmetic_examples = self.two_parse_examples + self.one_parse_examples
    from executor import Executor
    arithmetic_model = Model(
        grammar=arithmetic_grammar,
        feature_fn=Parse.operator_precedence_features,
        weights=defaultdict(float),  # Initialize with all weights at zero
        executor=Executor.execute)
    from metrics import DenotationAccuracyMetric
    from arithmetic import arithmetic_dev_examples
    b_trn, b_tst, a_trn, a_tst = arithmetic_model.train_test(
        train_examples=arithmetic_dev_examples,
        test_examples=arithmetic_examples[13:],
        training_metric=DenotationAccuracyMetric(),
        seed=1)
    # BEFORE SGD
    self.assertEqual(b_trn['denotation accuracy'], 64)
    # AFTER SGD
    self.assertEqual(a_trn['denotation accuracy'], 92)  # Improvement
def test_feature_function(self):
    from experiment import evaluate_model
    from metrics import denotation_match_metrics
    from scoring import Model
    from geo880 import geo880_train_examples
    rules = (self.rules_optionals +
             self.rules_collection_entity +
             self.rules_types +
             self.rules_relations +
             self.rules_intersection +
             self.rules_superlatives +
             self.rules_reverse_joins)
    grammar = Unit3Grammar(rules=rules, annotators=self.annotators)

    def empty_denotation_feature(parse):
        features = defaultdict(float)
        if parse.denotation == ():
            features['empty_denotation'] += 1.0
        return features

    weights = {'empty_denotation': -1.0}
    model = Model(grammar=grammar,
                  feature_fn=empty_denotation_feature,
                  weights=weights,
                  executor=self.geobase.executor().execute)
    metric_values = evaluate_model(model=model,
                                   examples=geo880_train_examples,
                                   metrics=denotation_match_metrics(),
                                   print_examples=False)
    self.assertEqual(235, metric_values['denotation accuracy'])
def test_evaluate_grammar_with_reverse_joins(self):
    from experiment import sample_wins_and_losses
    from geoquery import GeoQueryDomain
    from metrics import DenotationOracleAccuracyMetric
    from scoring import Model
    rules = (self.rules_optionals +
             self.rules_collection_entity +
             self.rules_types +
             self.rules_relations +
             self.rules_intersection +
             self.rules_superlatives +
             self.rules_reverse_joins)
    grammar = Unit3Grammar(rules=rules, annotators=self.annotators)
    model = Model(grammar=grammar, executor=self.geobase.executor().execute)
    metric = DenotationOracleAccuracyMetric()
    # If printing=True, prints a sampling of wins (correct semantics in
    # first parse) and losses on the dataset.
    metric_values = sample_wins_and_losses(domain=self.domain,
                                           model=model,
                                           metric=metric,
                                           seed=1,
                                           printing=False)
    self.assertEqual(11562, metric_values['number of parses'])
    self.assertEqual(152, metric_values['denotation accuracy'])
def test_evaluation_with_scoring(self):
    """
    Evaluate the grammar on all examples, collecting metrics:
    semantics oracle accuracy: # of examples where one parse or the other
        was correct.
    semantics accuracy: # of examples where the parse at position 0 was correct.
    """
    arithmetic_grammar = Grammar(self.arithmetic_rules)
    from executor import Executor
    arithmetic_model = Model(grammar=arithmetic_grammar,
                             feature_fn=Parse.operator_precedence_features,
                             weights=self.weights,
                             executor=Executor.execute)
    from experiment import evaluate_model
    metrics = evaluate_model(
        model=arithmetic_model,
        examples=self.one_parse_examples + self.two_parse_examples)
    self.assertEqual(metrics['semantics oracle accuracy'], 17)
    self.assertEqual(metrics['semantics accuracy'], 16)  # Improvement
def test_learning_from_semantics(self):
    """
    The first 13 examples are used for training.
    The last 4 examples are used for testing.
    b_trn: performance metrics on training set before training
    b_tst: performance metrics on test set before training
    a_trn: performance metrics on training set after training
    a_tst: performance metrics on test set after training
    semantics accuracy: # of examples where the parse at position 0 was correct
    denotation accuracy: # of examples where the denotation of the parse at
        position 0 was correct
    """
    arithmetic_grammar = Grammar(self.arithmetic_rules)
    arithmetic_examples = self.two_parse_examples + self.one_parse_examples
    from executor import Executor
    arithmetic_model = Model(
        grammar=arithmetic_grammar,
        feature_fn=Parse.operator_precedence_features,
        weights=defaultdict(float),  # Initialize with all weights at zero
        executor=Executor.execute)
    # Train based on correct/incorrect semantics
    from metrics import SemanticsAccuracyMetric
    b_trn, b_tst, a_trn, a_tst = arithmetic_model.train_test(
        train_examples=arithmetic_examples[:13],
        test_examples=arithmetic_examples[13:],
        training_metric=SemanticsAccuracyMetric(),
        seed=1)
    # BEFORE SGD
    self.assertEqual(b_trn['semantics accuracy'], 10)
    self.assertEqual(b_trn['denotation accuracy'], 11)
    self.assertEqual(b_tst['semantics accuracy'], 4)
    self.assertEqual(b_tst['denotation accuracy'], 4)
    # AFTER SGD
    self.assertEqual(a_trn['semantics accuracy'], 13)   # Improvement
    self.assertEqual(a_trn['denotation accuracy'], 13)  # Improvement
    self.assertEqual(a_tst['semantics accuracy'], 4)
    self.assertEqual(a_tst['denotation accuracy'], 4)
def evaluate_grammar(grammar=None,
                     executor=None,
                     examples=[],
                     examples_label=None,
                     metrics=standard_metrics(),
                     print_examples=True):
    return evaluate_model(model=Model(grammar=grammar, executor=executor),
                          examples=examples,
                          metrics=metrics,
                          print_examples=print_examples)
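# Usage sketch (an illustration, not part of the original code): evaluating
# the arithmetic grammar with evaluate_grammar. The names arithmetic_rules,
# one_parse_examples, and two_parse_examples are assumed to be defined as in
# the arithmetic test fixtures above.
from executor import Executor
evaluate_grammar(grammar=Grammar(arithmetic_rules),
                 executor=Executor.execute,
                 examples=one_parse_examples + two_parse_examples,
                 print_examples=False)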
def evaluate(self,
             executor=None,
             examples=[],
             examples_label=None,
             metrics=standard_metrics(),
             print_examples=False):
    return Model(grammar=self, executor=executor).evaluate(
        examples=examples,
        metrics=metrics,
        print_examples=print_examples)
def special_geo_evaluate(grammar=None, feature_fn=geo_domain.features):
    # Build the model by hand so that we can see all the pieces:
    geo_mod = Model(grammar=grammar,
                    feature_fn=feature_fn,
                    weights=geo_domain.weights(),
                    executor=geo_domain.execute)
    # This can be done with less fuss using experiment.train_test_for_domain,
    # but we want full access to the model, metrics, etc.
    train_test(model=geo_mod,
               train_examples=geo_domain.train_examples(),
               test_examples=geo_domain.test_examples(),
               metrics=geo_domain.metrics(),
               training_metric=geo_domain.training_metric(),
               seed=0,
               print_examples=False)
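# Usage sketch (an assumption, not part of the original code): geo_domain is
# taken to be a domain instance exposing grammar(), in line with the other
# geo_domain.* calls above, so the default grammar can be evaluated with:
special_geo_evaluate(grammar=geo_domain.grammar())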
def test_training_data4(self):
    from experiment import sample_wins_and_losses
    from metrics import SemanticsOracleAccuracyMetric
    from scoring import Model
    from travel import TravelDomain
    from geonames import GeoNamesAnnotator
    domain = TravelDomain()
    rules = (self.rules_travel +
             self.rules_travel_locations +
             self.rules_travel_modes +
             self.rules_travel_triggers +
             self.rules_request_types +
             self.rules_optionals)
    grammar = Unit2Grammar(rules=rules,
                           annotators=[GeoNamesAnnotator(live_requests=False)])
    model = Model(grammar=grammar)
    metric = SemanticsOracleAccuracyMetric()
    # If printing=True, prints a sampling of wins (correct semantics in
    # first parse) and losses on the dataset.
    metric_values = sample_wins_and_losses(domain=domain,
                                           model=model,
                                           metric=metric,
                                           seed=31,
                                           printing=False)
def test_evaluate_simple_grammar(self):
    from experiment import sample_wins_and_losses
    from metrics import DenotationOracleAccuracyMetric
    from scoring import Model
    rules = self.rules_optionals + self.rules_collection_entity
    grammar = Unit2Grammar(rules=rules, annotators=self.annotators)
    model = Model(grammar=grammar, executor=self.geobase.executor().execute)
    metric = DenotationOracleAccuracyMetric()
    # If printing=True, prints a sampling of wins (correct semantics in
    # first parse) and losses on the dataset.
    metric_values = sample_wins_and_losses(domain=self.domain,
                                           model=model,
                                           metric=metric,
                                           seed=1,
                                           printing=False)
    self.assertEqual(17, metric_values['number of parses'])
def test_evaluate_model(self):
    from experiment import evaluate_model
    from metrics import denotation_match_metrics
    from scoring import Model
    from geo880 import geo880_train_examples
    rules = (self.rules_optionals +
             self.rules_collection_entity +
             self.rules_types +
             self.rules_relations +
             self.rules_intersection +
             self.rules_superlatives +
             self.rules_reverse_joins)
    grammar = Unit3Grammar(rules=rules, annotators=self.annotators)
    model = Model(grammar=grammar, executor=self.geobase.executor().execute)
    # Set print_examples=True and look for 'what state has the shortest
    # river?'.
    evaluate_model(model=model,
                   examples=geo880_train_examples[:10],
                   metrics=denotation_match_metrics(),
                   print_examples=False)
def learn_lexical_semantics(domain, seed=None):
    from parsing import Grammar
    print '#' * 80
    print 'Learn lexical semantics experiment for domain: %s\n' % (
        domain.__class__.__name__)
    original_grammar = domain.grammar()
    expanded_rules = cartesian_product_of_lexical_rules(domain.rules())
    grammar = Grammar(rules=expanded_rules,
                      annotators=original_grammar.annotators,
                      start_symbol=original_grammar.start_symbol)
    model = Model(grammar=grammar,
                  feature_fn=domain.features,
                  weights=domain.weights(),  # Call weights(), as elsewhere, to get the weight dict
                  executor=domain.execute)
    train_test(model=model,
               train_examples=domain.train_examples(),
               test_examples=domain.test_examples(),
               metrics=domain.metrics(),
               training_metric=domain.training_metric(),
               seed=seed,
               print_examples=False)
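# Usage sketch (hypothetical): running the lexical-semantics experiment on a
# domain. ArithmeticDomain is assumed to be importable from the arithmetic
# module referenced in the tests above.
from arithmetic import ArithmeticDomain
learn_lexical_semantics(ArithmeticDomain(), seed=1)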
def clone_model(model):
    return Model(grammar=model.grammar,
                 feature_fn=model.feature_fn,
                 weights=defaultdict(float),  # Zero the weights.
                 executor=model.executor)
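# Usage sketch (hypothetical): clone_model yields a copy of an existing model
# that shares its grammar, feature function, and executor but starts from
# zeroed weights, which is convenient for re-running training from scratch.
# trained_model below is a placeholder for any existing Model instance.
fresh_model = clone_model(trained_model)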
def model(self):
    return Model(grammar=self.grammar(),
                 feature_fn=self.features,
                 weights=self.weights(),
                 executor=self.execute)
travel_domain = TravelDomain()
travel_grammar = travel_domain.grammar()

def basic_feature_function(parse):
    """Features for the rule used for the root node and its children."""
    features = defaultdict(float)
    features[str(parse.rule)] += 1.0
    for child in parse.children:
        features[str(child.rule)] += 1.0
    return features

# This code evaluates the current grammar:
train_test(model=Model(grammar=travel_grammar, feature_fn=basic_feature_function),
           train_examples=travel_train_examples,
           test_examples=travel_test_examples,
           print_examples=False)

# ### Question 4
#
# With the default travel grammar, many of the errors on training examples
# occur because the origin isn't marked by "from". You might have noticed that
# "directions New York to Philadelphia" is not handled properly in our opening
# example. Other examples include "transatlantic cruise southampton to tampa",
# "fly boston to myrtle beach spirit airlines", and "distance usa to peru".
# __Your tasks__: (i) extend the grammar with a single rule to handle examples
# like these, and run another evaluation using this expanded grammar (submit
# your completion of the following starter code); (ii) in 1–2 sentences,
# summarize what happened to the post-training performance metrics when this
# rule was added.
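# A rough sketch of what an expanded-grammar evaluation might look like. This
# is an illustration only, not the official starter code or solution: the Rule
# import, the category names '$FromLocation' and '$Location', the semantic
# function, and travel_domain.rules() are all assumptions about the travel
# grammar's conventions rather than details taken from it.
from parsing import Grammar, Rule

# Hypothetical rule: let a bare location (not introduced by "from") fill the
# origin slot.
bare_origin_rule = Rule('$FromLocation', '$Location', lambda sems: sems[0])

expanded_grammar = Grammar(rules=travel_domain.rules() + [bare_origin_rule],
                           annotators=travel_grammar.annotators,
                           start_symbol=travel_grammar.start_symbol)
train_test(model=Model(grammar=expanded_grammar,
                       feature_fn=basic_feature_function),
           train_examples=travel_train_examples,
           test_examples=travel_test_examples,
           print_examples=False)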