def get_features(self, context_obj):
    """Compute four language-model features for the edges of a target phrase.

    Reads ``context_obj['index']`` (elements 0 and 1), ``context_obj['target']``
    and the first/last elements of ``context_obj['token']``.

    Returns:
        ``[left_order, right_order, left_backoff, right_backoff]``: the two
        ``self.check_lm`` results for the edge n-grams (``self.order - 1``
        context tokens) followed by the two ``self.get_backoff`` results for
        the edge trigrams (2 context tokens).
    """
    start = context_obj['index'][0]
    end = context_obj['index'][1]
    target = context_obj['target']
    first_tok = context_obj['token'][0]
    last_tok = context_obj['token'][-1]

    def _left(size):
        # Left context of the first phrase token, followed by the token itself.
        return left_context(target, first_tok, context_size=size, idx=start) + [first_tok]

    def _right(size):
        # Last phrase token followed by its right context.
        # NOTE(review): idx is index[1] as-is here, while other extractors
        # pass index[1] - 1 for the last phrase token -- confirm intended.
        return [last_tok] + right_context(target, last_tok, context_size=size, idx=end)

    left_ngram = _left(self.order - 1)
    right_ngram = _right(self.order - 1)
    orders = [
        self.check_lm(left_ngram, side='left'),
        self.check_lm(right_ngram, side='right'),
    ]

    left_trigram = _left(2)
    right_trigram = _right(2)
    backoffs = [
        self.get_backoff(left_trigram),
        self.get_backoff(right_trigram),
    ]
    return orders + backoffs
def get_features(self, context_obj):
    """Return the one-token contexts around a source phrase and a target phrase.

    The source contexts are only looked up when ``context_obj`` has a
    non-empty ``'source_token'`` and a ``'source_index'`` with more than one
    element; otherwise both source features are the empty string.

    Returns:
        ``[left_src, right_src, left_tg, right_tg]`` -- the first element of
        each context list returned by ``left_context`` / ``right_context``.

    Raises:
        SystemExit: if ``left_context`` raises ``IndexError`` for the source
            side (the source data is printed first).
    """
    has_source_phrase = (
        'source_token' in context_obj
        and len(context_obj['source_token']) > 0
        and len(context_obj['source_index']) > 1
    )
    if has_source_phrase:
        try:
            left_src = left_context(
                context_obj['source'],
                context_obj['source_token'][0],
                context_size=1,
                idx=context_obj['source_index'][0],
            )
        except IndexError:
            # Dump the offending source data before aborting.
            print(context_obj['source'])
            print(context_obj['source_token'])
            print(context_obj['source_index'])
            sys.exit()
        right_src = right_context(
            context_obj['source'],
            context_obj['source_token'][-1],
            context_size=1,
            idx=context_obj['source_index'][1] - 1,
        )
    else:
        # One-element lists, so the [0] lookups in the return statement give
        # '' instead of raising IndexError (indexing an empty str fails).
        left_src = [""]
        right_src = [""]

    left_tg = left_context(
        context_obj['target'],
        context_obj['token'][0],
        context_size=1,
        idx=context_obj['index'][0],
    )
    right_tg = right_context(
        context_obj['target'],
        context_obj['token'][-1],
        context_size=1,
        idx=context_obj['index'][1] - 1,
    )
    return [left_src[0], right_src[0], left_tg[0], right_tg[0]]
def get_features(self, context_obj):
    """Return the one-token contexts around a source phrase and a target phrase.

    When ``context_obj`` has no ``'source_token'`` key, both source features
    are the empty string.

    Returns:
        ``[left_src, right_src, left_tg, right_tg]`` -- the first element of
        each context list returned by ``left_context`` / ``right_context``.
    """
    if 'source_token' in context_obj:
        src_span = context_obj['source_index']
        left_src = left_context(
            context_obj['source'],
            context_obj['source_token'][0],
            context_size=1,
            idx=src_span[0],
        )
        right_src = right_context(
            context_obj['source'],
            context_obj['source_token'][-1],
            context_size=1,
            idx=src_span[1] - 1,
        )
    else:
        # One-element lists, so the [0] lookups below yield '' rather than
        # raising IndexError on an empty string.
        left_src = [""]
        right_src = [""]

    tg_span = context_obj['index']
    left_tg = left_context(
        context_obj['target'],
        context_obj['token'][0],
        context_size=1,
        idx=tg_span[0],
    )
    right_tg = right_context(
        context_obj['target'],
        context_obj['token'][-1],
        context_size=1,
        idx=tg_span[1] - 1,
    )
    return [left_src[0], right_src[0], left_tg[0], right_tg[0]]
def get_features(self, context_obj):
    """Return the POS tags bordering a source phrase and a target phrase.

    Works on ``context_obj['source_pos']`` / ``context_obj['target_pos']``,
    using ``context_obj['source_index']`` and ``context_obj['index']`` to
    locate the first (``[0]``) and last (``[1] - 1``) phrase positions.

    Returns:
        ``[left_src, right_src, left_tg, right_tg]`` -- the first element of
        each context list returned by ``left_context`` / ``right_context``.

    Raises:
        NoDataError: if ``'target_pos'`` or ``'source_pos'`` is missing.
    """
    # Checked in this order: target_pos first, then source_pos.
    for required in ('target_pos', 'source_pos'):
        if required not in context_obj:
            raise NoDataError(required, context_obj, 'POSContextFeatureExtractor')

    src_tags = context_obj['source_pos']
    src_first = context_obj['source_index'][0]
    src_before = left_context(src_tags, src_tags[src_first], context_size=1, idx=src_first)
    src_last = context_obj['source_index'][1] - 1
    src_after = right_context(src_tags, src_tags[src_last], context_size=1, idx=src_last)

    tg_tags = context_obj['target_pos']
    tg_first = context_obj['index'][0]
    tg_before = left_context(tg_tags, tg_tags[tg_first], context_size=1, idx=tg_first)
    tg_last = context_obj['index'][1] - 1
    tg_after = right_context(tg_tags, tg_tags[tg_last], context_size=1, idx=tg_last)

    return [src_before[0], src_after[0], tg_before[0], tg_after[0]]
def get_features(self, context_obj):
    """Compute four language-model features for the edges of a target phrase.

    The n-grams are anchored on the first and last elements of
    ``context_obj['token']`` and extended with context from
    ``context_obj['target']``.

    Returns:
        ``[left_ngram_order, right_ngram_order, backoff_left, backoff_right]``
        where the orders come from ``self.check_lm`` and the backoffs from
        ``self.get_backoff`` applied to the edge trigrams.
    """
    span = context_obj['index']
    begin, finish = span[0], span[1]
    sentence = context_obj['target']
    head = context_obj['token'][0]
    tail = context_obj['token'][-1]

    # N-grams with self.order - 1 context tokens on the outer side.
    # NOTE(review): the right context uses idx=index[1] unchanged, whereas
    # other extractors use index[1] - 1 for the last token -- confirm.
    left_ngram = left_context(sentence, head, context_size=self.order - 1, idx=begin)
    left_ngram = left_ngram + [head]
    right_ngram = [tail] + right_context(sentence, tail, context_size=self.order - 1, idx=finish)

    features = []
    features.append(self.check_lm(left_ngram, side='left'))
    features.append(self.check_lm(right_ngram, side='right'))

    # Fixed-size trigrams (two context tokens) for the backoff features.
    left_trigram = left_context(sentence, head, context_size=2, idx=begin) + [head]
    right_trigram = [tail] + right_context(sentence, tail, context_size=2, idx=finish)

    features.append(self.get_backoff(left_trigram))
    features.append(self.get_backoff(right_trigram))
    return features
def get_features(self, context_obj):
    """Compute two language-model features for the source words aligned to a token.

    ``context_obj['alignments'][context_obj['index']]`` is sorted and the
    source slice from its smallest to its largest index is used as the core
    of the n-grams.

    Returns:
        ``[0, 0]`` when the alignment is empty, otherwise the two
        ``self.check_lm`` results for the left and right n-grams.

    Raises:
        NoDataError: if ``'source'`` or ``'alignments'`` is missing.
    """
    for required in ('source', 'alignments'):
        if required not in context_obj:
            raise NoDataError(required, context_obj, 'SourceLMFeatureExtractor')

    aligned = sorted(context_obj['alignments'][context_obj['index']])
    if not aligned:
        # Unaligned target token.
        return [0, 0]

    first, last = aligned[0], aligned[-1]
    span_extra = last - first
    source = context_obj['source']
    core = source[first:last + 1]

    # The context shrinks by the number of extra words already in the core.
    left_ngram = left_context(
        source, core[0], context_size=self.order - 1 - span_extra, idx=first
    ) + core
    right_ngram = core + right_context(
        source, core[-1], context_size=self.order - 1 - span_extra, idx=last
    )

    left_order = self.check_lm(left_ngram, side='left')
    right_order = self.check_lm(right_ngram, side='right')
    return [left_order, right_order]
def get_features(self, context_obj):
    """Build four '|'-joined combination features for one target token.

    Returns:
        ``[token|left, token|right, token|src_token, tg_pos|src_pos]`` where
        ``left`` / ``right`` are the space-joined one-token contexts,
        ``tg_pos`` is ``''`` when ``context_obj['target_pos']`` equals ``[]``,
        and both source values are ``'__unaligned__'`` when the token's
        alignment is ``None``.
    """
    position = context_obj['index']
    token = context_obj['token']

    left = ' '.join(
        left_context(context_obj['target'], token, context_size=1, idx=position))
    right = ' '.join(
        right_context(context_obj['target'], token, context_size=1, idx=position))

    if context_obj['target_pos'] != []:
        tg_pos = context_obj['target_pos'][position]
    else:
        tg_pos = ''

    src_idx = context_obj['alignments'][position]
    if src_idx is not None:
        src_token = context_obj['source'][src_idx]
        src_pos = context_obj['source_pos'][src_idx]
    else:
        src_token = src_pos = '__unaligned__'

    return [
        token + '|' + left,
        token + '|' + right,
        token + '|' + src_token,
        tg_pos + '|' + src_pos,
    ]
def get_features(self, context_obj):
    """Return the aligned source word and its left/right source contexts.

    Returns:
        ``[aligned_to, left, right]``. ``left`` and ``right`` are the
        ``'|'``-joined contexts of ``self.context_size`` tokens around the
        aligned source word. For an unaligned token (alignment is ``None``)
        all three slots are built from ``'__unaligned__'`` placeholders.

    Raises:
        NoDataError: if ``'source'``, ``'target'`` or ``'alignments'`` is
            missing, or if source/target is ``None``.
        SystemExit: if ``context_obj['index']`` is out of range for the
            alignment list (diagnostics are printed first).
    """
    if 'source' not in context_obj or context_obj['source'] is None:
        raise NoDataError('source', context_obj, 'AlignmentFeatureExtractor')
    # A None source has already been rejected above, so only target is tested.
    if 'target' not in context_obj or context_obj['target'] is None:
        raise NoDataError('target', context_obj, 'AlignmentFeatureExtractor')
    if 'alignments' not in context_obj:
        raise NoDataError('alignments', context_obj, 'AlignmentFeatureExtractor')

    # Source word aligned to the current token.
    try:
        align_idx = context_obj['alignments'][context_obj['index']]
    except IndexError:
        print("{} items in the alignment, needed {}-th".format(len(context_obj['alignments']), context_obj['index']))
        print(context_obj['alignments'], context_obj['target'], context_obj['source'])
        sys.exit()

    # Unaligned word: no source word and no source contexts.
    if align_idx is None:
        placeholder = '|'.join(['__unaligned__'] * self.context_size)
        return ['__unaligned__', placeholder, placeholder]

    # TODO: find contexts for all words aligned to the token (now only 1st word)
    source = context_obj['source']
    left = '|'.join(left_context(source, source[align_idx], context_size=self.context_size, idx=align_idx))
    right = '|'.join(right_context(source, source[align_idx], context_size=self.context_size, idx=align_idx))
    aligned_to = source[align_idx]
    return [aligned_to, left, right]
def get_features(self, context_obj):
    """Return the one-token contexts around a source phrase and a target phrase.

    Source contexts are looked up only if ``'source_token'`` is present and
    non-empty and ``'source_index'`` has more than one element; otherwise
    the two source features are empty strings.

    Returns:
        ``[left_src, right_src, left_tg, right_tg]`` -- the first element of
        each context list returned by ``left_context`` / ``right_context``.

    Raises:
        SystemExit: if the source-side ``left_context`` call raises
            ``IndexError`` (the source data is printed first).
    """
    source_usable = False
    if 'source_token' in context_obj:
        source_usable = (len(context_obj['source_token']) > 0
                         and len(context_obj['source_index']) > 1)

    # Defaults are one-element lists: the return statement indexes [0], which
    # would raise IndexError on an empty string.
    left_src = [""]
    right_src = [""]
    if source_usable:
        try:
            left_src = left_context(context_obj['source'],
                                    context_obj['source_token'][0],
                                    context_size=1,
                                    idx=context_obj['source_index'][0])
        except IndexError:
            # Dump the offending source data before aborting.
            print(context_obj['source'])
            print(context_obj['source_token'])
            print(context_obj['source_index'])
            sys.exit()
        right_src = right_context(context_obj['source'],
                                  context_obj['source_token'][-1],
                                  context_size=1,
                                  idx=context_obj['source_index'][1] - 1)

    left_tg = left_context(context_obj['target'],
                           context_obj['token'][0],
                           context_size=1,
                           idx=context_obj['index'][0])
    right_tg = right_context(context_obj['target'],
                             context_obj['token'][-1],
                             context_size=1,
                             idx=context_obj['index'][1] - 1)
    return [left_src[0], right_src[0], left_tg[0], right_tg[0]]
def get_features(self, context_obj):
    """Compute five language-model features for a single target token.

    Returns:
        ``[left_ngram_order, right_ngram_order, backoff_left,
        backoff_middle, backoff_right]``: two ``self.check_lm`` results for
        n-grams with ``self.order - 1`` context tokens, then three
        ``self.get_backoff`` results for the left, middle
        (``extract_window``) and right trigrams.
    """
    position = context_obj['index']
    sentence = context_obj['target']
    word = context_obj['token']

    # Highest-order n-grams ending / starting at the token.
    ngram_l = left_context(sentence, word, context_size=self.order - 1, idx=position)
    ngram_l = ngram_l + [word]
    ngram_r = [word] + right_context(sentence, word, context_size=self.order - 1, idx=position)

    order_l = self.check_lm(ngram_l, side='left')
    order_r = self.check_lm(ngram_r, side='right')

    # Trigrams with the token at the end, in the window, and at the start.
    tri_l = left_context(sentence, word, context_size=2, idx=position) + [word]
    tri_m = extract_window(sentence, word, idx=position)
    tri_r = [word] + right_context(sentence, word, context_size=2, idx=position)

    backoffs = [self.get_backoff(tri) for tri in (tri_l, tri_m, tri_r)]
    return [order_l, order_r] + backoffs
def get_features(self, context_obj):
    """Return the source contexts and the left target context of a phrase.

    When ``context_obj`` has no ``'source_token'`` key, both source features
    are the empty string.

    Returns:
        ``[left_src, right_src, left_tg]`` -- the first element of each
        context list returned by ``left_context`` / ``right_context``.
    """
    if 'source_token' in context_obj:
        left_src = left_context(
            context_obj['source'],
            context_obj['source_token'][0],
            context_size=1,
            idx=context_obj['source_index'][0],
        )
        right_src = right_context(
            context_obj['source'],
            context_obj['source_token'][-1],
            context_size=1,
            idx=context_obj['source_index'][1] - 1,
        )
    else:
        # One-element lists, so the [0] lookups below yield '' rather than
        # raising IndexError on an empty string.
        left_src = [""]
        right_src = [""]

    left_tg = left_context(
        context_obj['target'],
        context_obj['token'][0],
        context_size=1,
        idx=context_obj['index'][0],
    )
    return [left_src[0], right_src[0], left_tg[0]]
def get_features(self, context_obj):
    """Build two token|context|aligned-source-word combination features.

    Returns:
        ``[token|left|aligned_to, token|right|aligned_to]`` where ``left`` /
        ``right`` are the space-joined one-token contexts and ``aligned_to``
        is ``'__unaligned__'`` when the token's alignment is ``None``.
    """
    position = context_obj['index']
    token = context_obj['token']

    left = ' '.join(left_context(context_obj['target'], token, context_size=1, idx=position))
    right = ' '.join(right_context(context_obj['target'], token, context_size=1, idx=position))

    src_idx = context_obj['alignments'][position]
    aligned_to = '__unaligned__' if src_idx is None else context_obj['source'][src_idx]

    return ['|'.join((token, side, aligned_to)) for side in (left, right)]
def get_features(self, context_obj):
    """Return a target token with its left and right contexts as strings.

    Returns:
        ``[token, left, right]`` where ``left`` / ``right`` are the
        space-joined results of ``left_context`` / ``right_context`` with
        ``self.context_size`` tokens.
    """
    token = context_obj['token']
    sentence = context_obj['target']

    before = ' '.join(
        left_context(sentence, token, context_size=self.context_size,
                     idx=context_obj['index']))
    after = ' '.join(
        right_context(sentence, token, context_size=self.context_size,
                      idx=context_obj['index']))

    return [token, before, after]
def get_features(self, context_obj):
    """Compute five language-model features for a single target token.

    Returns:
        ``[left_ngram_order, right_ngram_order, backoff_left,
        backoff_middle, backoff_right]``: the ``self.check_lm`` results for
        the left/right n-grams followed by the ``self.get_backoff`` results
        for the left, middle and right trigrams.
    """
    idx = context_obj['index']
    target = context_obj['target']
    token = context_obj['token']

    def _ending_at_token(size):
        # ``size`` left-context tokens followed by the token.
        return left_context(target, token, context_size=size, idx=idx) + [token]

    def _starting_at_token(size):
        # The token followed by ``size`` right-context tokens.
        return [token] + right_context(target, token, context_size=size, idx=idx)

    left_ngram = _ending_at_token(self.order - 1)
    right_ngram = _starting_at_token(self.order - 1)
    left_ngram_order = self.check_lm(left_ngram, side='left')
    right_ngram_order = self.check_lm(right_ngram, side='right')

    left_trigram = _ending_at_token(2)
    middle_trigram = extract_window(target, token, idx=idx)
    right_trigram = _starting_at_token(2)

    # TODO: instead of _START_ there should be <s>
    backoff_left = self.get_backoff(left_trigram)
    backoff_middle = self.get_backoff(middle_trigram)
    backoff_right = self.get_backoff(right_trigram)

    return [
        left_ngram_order,
        right_ngram_order,
        backoff_left,
        backoff_middle,
        backoff_right,
    ]
def get_features(self, context_obj):
    """Build four '|'-joined combination features for one target token.

    Returns:
        ``[token|left, token|right, token|src_token, tg_pos|src_pos]``.
        ``tg_pos`` is ``''`` when ``context_obj['target_pos']`` equals ``[]``;
        the source token and POS are ``'__unaligned__'`` when the alignment
        entry for the token is ``None``.
    """
    token = context_obj['token']
    sentence = context_obj['target']

    left_words = left_context(sentence, token, context_size=1, idx=context_obj['index'])
    left = ' '.join(left_words)
    right_words = right_context(sentence, token, context_size=1, idx=context_obj['index'])
    right = ' '.join(right_words)

    tg_pos = ''
    if context_obj['target_pos'] != []:
        tg_pos = context_obj['target_pos'][context_obj['index']]

    align_idx = context_obj['alignments'][context_obj['index']]
    if align_idx is None:
        src_token, src_pos = '__unaligned__', '__unaligned__'
    else:
        src_token = context_obj['source'][align_idx]
        src_pos = context_obj['source_pos'][align_idx]

    pairs = [(token, left), (token, right), (token, src_token), (tg_pos, src_pos)]
    return [first + '|' + second for first, second in pairs]
def get_features(self, context_obj):
    """Return the aligned source word and its left/right source contexts.

    Returns:
        ``[aligned_to, left, right]`` with ``left`` / ``right`` being the
        ``'|'``-joined source contexts of ``self.context_size`` tokens. When
        the token's alignment is ``None`` every slot is filled with
        ``'__unaligned__'`` placeholders instead.

    Raises:
        NoDataError: if ``'source'``, ``'target'`` or ``'alignments'`` is
            missing, or if source/target is ``None``.
        SystemExit: if ``context_obj['index']`` is out of range for the
            alignment list (diagnostics are printed first).
    """
    if context_obj.get('source') is None:
        raise NoDataError('source', context_obj, 'AlignmentFeatureExtractor')
    # Source is known to be non-None here, so only the target is examined.
    if context_obj.get('target') is None:
        raise NoDataError('target', context_obj, 'AlignmentFeatureExtractor')
    if 'alignments' not in context_obj:
        raise NoDataError('alignments', context_obj,
                          'AlignmentFeatureExtractor')

    # Source word aligned to the current token.
    try:
        align_idx = context_obj['alignments'][context_obj['index']]
    except IndexError:
        print("{} items in the alignment, needed {}-th".format(
            len(context_obj['alignments']), context_obj['index']))
        print(context_obj['alignments'], context_obj['target'],
              context_obj['source'])
        sys.exit()

    if align_idx is None:
        # Unaligned word: no source word and no source contexts.
        no_context = '|'.join('__unaligned__' for _ in range(self.context_size))
        return ['__unaligned__', no_context, no_context]

    # TODO: find contexts for all words aligned to the token (now only 1st word)
    aligned_to = context_obj['source'][align_idx]
    left = '|'.join(
        left_context(context_obj['source'],
                     aligned_to,
                     context_size=self.context_size,
                     idx=align_idx))
    right = '|'.join(
        right_context(context_obj['source'],
                      aligned_to,
                      context_size=self.context_size,
                      idx=align_idx))
    return [aligned_to, left, right]
def get_features(self, context_obj):
    """Compute two language-model features for the source word aligned to a token.

    Returns:
        ``[0, 0]`` when the token's alignment is ``None``; otherwise the
        ``self.check_lm`` results for the n-grams made of the aligned source
        word plus two context tokens on the left and on the right.

    Raises:
        NoDataError: if ``'source'`` or ``'alignments'`` is missing.
    """
    for required in ('source', 'alignments'):
        if required not in context_obj:
            raise NoDataError(required, context_obj, 'SourceLMFeatureExtractor')

    src_idx = context_obj['alignments'][context_obj['index']]
    if src_idx is None:
        # Unaligned token.
        return [0, 0]

    source = context_obj['source']
    src_word = source[src_idx]

    before = left_context(source, src_word, context_size=2, idx=src_idx)
    left_ngram = before + [src_word]
    after = right_context(source, src_word, context_size=2, idx=src_idx)
    right_ngram = [src_word] + after

    return [
        self.check_lm(left_ngram, side='left'),
        self.check_lm(right_ngram, side='right'),
    ]
def get_features(self, context_obj):
    """Compute two language-model features for the source word aligned to a token.

    Returns:
        ``[left_ngram_order, right_ngram_order]`` from ``self.check_lm``, or
        ``[0, 0]`` if the alignment entry for the token is ``None``.

    Raises:
        NoDataError: if ``'source'`` or ``'alignments'`` is missing.
    """
    extractor = 'SourceLMFeatureExtractor'
    if 'source' not in context_obj:
        raise NoDataError('source', context_obj, extractor)
    if 'alignments' not in context_obj:
        raise NoDataError('alignments', context_obj, extractor)

    align_idx = context_obj['alignments'][context_obj['index']]
    if align_idx is not None:
        align_token = context_obj['source'][align_idx]
        # Two context tokens on each side of the aligned source word.
        window = {'context_size': 2, 'idx': align_idx}
        left_ngram = left_context(context_obj['source'], align_token, **window) + [align_token]
        right_ngram = [align_token] + right_context(context_obj['source'], align_token, **window)
        left_ngram_order = self.check_lm(left_ngram, side='left')
        right_ngram_order = self.check_lm(right_ngram, side='right')
        return [left_ngram_order, right_ngram_order]

    # Unaligned token.
    return [0, 0]
def get_features(self, context_obj):
    """Build two token|context|aligned-source-word combination features.

    Returns:
        ``[token|left|aligned_to, token|right|aligned_to]``; ``aligned_to``
        is ``'__unaligned__'`` when the alignment entry for the token is
        ``None``, otherwise the source word at that index.
    """
    token = context_obj['token']
    sentence = context_obj['target']

    left = ' '.join(
        left_context(sentence, token, context_size=1, idx=context_obj['index']))
    right = ' '.join(
        right_context(sentence, token, context_size=1, idx=context_obj['index']))

    align_idx = context_obj['alignments'][context_obj['index']]
    if align_idx is not None:
        aligned_to = context_obj['source'][align_idx]
    else:
        aligned_to = '__unaligned__'

    suffix = '|' + aligned_to
    return [token + '|' + left + suffix, token + '|' + right + suffix]
def get_features(self, context_obj):
    """Return the aligned source words and the contexts around them.

    If ``context_obj`` has no ``'alignments'``, they are computed with
    ``align_sentence`` and stored back into ``context_obj`` (requires a
    non-empty ``self.model``).

    Returns:
        ``[aligned_to, left, right]``: the ``'|'``-joined aligned source
        words, the left context of the first one and the right context of
        the last one. For a token with an empty alignment all three slots
        are built from ``'__unaligned__'`` placeholders.

    Raises:
        NoDataError: if source/target is missing or ``None``, or if
            alignments are missing and ``self.model`` is ``''``.
    """
    extractor = 'AlignmentFeatureExtractor'
    if 'source' not in context_obj or context_obj['source'] is None:
        raise NoDataError('source', context_obj, extractor)
    if ('target' not in context_obj
            or context_obj['source'] is None
            or context_obj['target'] is None):
        raise NoDataError('target', context_obj, extractor)

    if 'alignments' not in context_obj:
        if self.model == '':
            raise NoDataError('alignments', context_obj, extractor)
        context_obj['alignments'] = align_sentence(
            context_obj['source'], context_obj['target'], self.model)

    # Source word(s) aligned to the current token, in ascending order.
    aligned = sorted(context_obj['alignments'][context_obj['index']])

    if not aligned:
        # Unaligned word: no source and no source contexts.
        filler = ['__unaligned__'] * self.context_size
        return ['__unaligned__', '|'.join(filler), '|'.join(filler)]

    # TODO: find contexts for all words aligned to the token (now only 1st word)
    source = context_obj['source']
    first, last = aligned[0], aligned[-1]
    left = '|'.join(
        left_context(source, source[first], context_size=self.context_size, idx=first))
    right = '|'.join(
        right_context(source, source[last], context_size=self.context_size, idx=last))
    aligned_to = '|'.join(source[i] for i in aligned)
    return [aligned_to, left, right]
def test_right_context(self):
    """right_context should pad with _END_ past the end of the sentence."""
    tokens = word_tokenize('this is a test sentence.'.lower())
    expected = ['.', '_END_', '_END_']

    # Ask for three tokens to the right of 'sentence'.
    observed = ngram_window_extractor.right_context(
        tokens, 'sentence', context_size=3)

    self.assertListEqual(observed, expected,
                         'right_context should append _END_ tokens')
def get_features(self, context_obj):
    """Return a target token with its left and right contexts as strings.

    Returns:
        ``[token, left, right]``; ``left`` and ``right`` are the context
        lists of ``self.context_size`` tokens joined with single spaces.
    """
    token = context_obj['token']
    features = [token]
    # Left side first, then right side, matching the output order.
    for extract in (left_context, right_context):
        words = extract(context_obj['target'], token,
                        context_size=self.context_size,
                        idx=context_obj['index'])
        features.append(' '.join(words))
    return features