def test_extract_window(self):
    """Check the window returned by ``extract_window`` for two tokens.

    Each case tokenizes a lower-cased sentence, asks for the window
    around one token and compares it with the expected three-item list.
    The first case expects ``'_START_'`` in front of the first token.
    """
    # (sentence, token to look up, expected window, failure message)
    cases = [
        (
            'this is a test sentence.',
            'this',
            ['_START_', 'this', 'is'],
            'A window starting with the first token should be correct',
        ),
        (
            'this is a test sentence.',
            'is',
            ['this', 'is', 'a'],
            'A window starting with the second token should be correct',
        ),
    ]

    for sentence, token, expected, message in cases:
        tokens = word_tokenize(sentence.lower())
        observed = ngram_window_extractor.extract_window(tokens, token)
        self.assertListEqual(observed, expected, message)
def get_features(self, context_obj):
    """Return five language-model features for one token.

    ``context_obj`` is read through the keys ``'index'``, ``'target'``
    and ``'token'``.

    The returned list holds, in order:
      1. ``check_lm`` result for the left n-gram (``side='left'``)
      2. ``check_lm`` result for the right n-gram (``side='right'``)
      3. ``get_backoff`` result for the left trigram
      4. ``get_backoff`` result for the middle window
      5. ``get_backoff`` result for the right trigram
    """
    idx = context_obj['index']
    target = context_obj['target']
    token = context_obj['token']

    # Context of size ``self.order - 1`` on each side, with the token
    # itself appended (left) or prepended (right).
    ngram_context = self.order - 1
    preceding = left_context(target, token, context_size=ngram_context, idx=idx)
    left_ngram = preceding + [token]
    following = right_context(target, token, context_size=ngram_context, idx=idx)
    right_ngram = [token] + following

    left_ngram_order = self.check_lm(left_ngram, side='left')
    right_ngram_order = self.check_lm(right_ngram, side='right')

    # Fixed-size windows (context of 2) used for the backoff features.
    left_trigram = left_context(target, token, context_size=2, idx=idx) + [token]
    middle_trigram = extract_window(target, token, idx=idx)
    right_trigram = [token] + right_context(target, token, context_size=2, idx=idx)

    features = [left_ngram_order, right_ngram_order]
    for window in (left_trigram, middle_trigram, right_trigram):
        features.append(self.get_backoff(window))
    return features
def get_features(self, context_obj):
    """Compute the n-gram order and backoff features for a token.

    Reads ``context_obj['index']``, ``context_obj['target']`` and
    ``context_obj['token']``. Returns a five-element list:
    ``[left_ngram_order, right_ngram_order, backoff_left,
    backoff_middle, backoff_right]``, where the first two values come
    from ``self.check_lm`` and the last three from ``self.get_backoff``.
    """
    position = context_obj['index']
    sequence = context_obj['target']
    word = context_obj['token']

    def ending_at_word(size):
        # Left context of the requested size followed by the token.
        before = left_context(sequence, word, context_size=size, idx=position)
        return before + [word]

    def starting_at_word(size):
        # The token followed by right context of the requested size.
        after = right_context(sequence, word, context_size=size, idx=position)
        return [word] + after

    # N-grams sized by the configured order.
    span = self.order - 1
    left_ngram = ending_at_word(span)
    right_ngram = starting_at_word(span)
    left_ngram_order = self.check_lm(left_ngram, side='left')
    right_ngram_order = self.check_lm(right_ngram, side='right')

    # Trigram-sized windows for the backoff features.
    left_trigram = ending_at_word(2)
    middle_trigram = extract_window(sequence, word, idx=position)
    right_trigram = starting_at_word(2)

    # TODO: instead of _START_ there should be <s>
    backoff_left = self.get_backoff(left_trigram)
    backoff_middle = self.get_backoff(middle_trigram)
    backoff_right = self.get_backoff(right_trigram)

    return [
        left_ngram_order,
        right_ngram_order,
        backoff_left,
        backoff_middle,
        backoff_right,
    ]