def get_features(self, context_obj):
    """Return POS tags neighbouring the current phrase on both sides.

    Reads ``source_pos``, ``target_pos``, ``source_index`` and ``index``
    from ``context_obj``. ``source_index`` and ``index`` are indexed as
    ``(start, end)`` pairs with an exclusive end (the code uses
    ``source_index[1] - 1`` as the last source position).

    Returns:
        A three-element list holding the first item of:
        the left context of the first source position, the right context
        of the last source position, and the left context of the first
        target position (each requested with ``context_size=1``).

    Raises:
        NoDataError: if ``target_pos`` or ``source_pos`` is missing.
    """
    # Same check order as before: target tags first, then source tags.
    for required in ('target_pos', 'source_pos'):
        if required not in context_obj:
            raise NoDataError(required, context_obj,
                              'POSContextFeatureExtractor')

    src_tags = context_obj['source_pos']

    # Tag immediately to the left of the first aligned source position.
    src_first = context_obj['source_index'][0]
    before_src = left_context(
        src_tags, src_tags[src_first], context_size=1, idx=src_first)

    # Tag immediately to the right of the last aligned source position.
    src_last = context_obj['source_index'][1] - 1
    after_src = right_context(
        src_tags, src_tags[src_last], context_size=1, idx=src_last)

    # Tag immediately to the left of the first target position.
    tgt_tags = context_obj['target_pos']
    tgt_first = context_obj['index'][0]
    before_tgt = left_context(
        tgt_tags, tgt_tags[tgt_first], context_size=1, idx=tgt_first)

    return [before_src[0], after_src[0], before_tgt[0]]
def get_features(self, context_obj):
    """Return the POS tag of the target word and of its aligned source word.

    If ``target_pos`` / ``source_pos`` are absent they are produced with
    ``self._call_tagger`` from ``target`` / ``source`` and stored back into
    ``context_obj``.

    Returns:
        ``[target_tag, source_tag]`` where

        * ``target_tag`` is ``target_pos[index]``, or ``''`` when
          ``target_pos`` is an empty list;
        * ``source_tag`` is ``source_pos[alignments[index]]``,
          ``'__unaligned__'`` when that alignment is ``None``, or ``[]``
          when ``source_pos`` is empty or ``alignments`` is missing.

    Raises:
        NoDataError: if tags are missing and there is no non-``None``
            sentence to tag.
    """
    # Tag the target sentence on demand.
    if 'target_pos' not in context_obj:
        if 'target' not in context_obj or context_obj['target'] is None:
            raise NoDataError('target_pos', context_obj,
                              'POSFeatureExtractor')
        context_obj['target_pos'] = self._call_tagger(context_obj['target'])

    # Tag the source sentence on demand.
    if 'source_pos' not in context_obj:
        if 'source' not in context_obj or context_obj['source'] is None:
            raise NoDataError('source_pos', context_obj,
                              'POSFeatureExtractor')
        context_obj['source_pos'] = self._call_tagger(
            context_obj['source'], lang='src')

    target_tags = context_obj['target_pos']
    if target_tags != []:
        target_tag = target_tags[context_obj['index']]
    else:
        target_tag = ''

    # Default when there is nothing to look the source tag up in.
    source_tag = []
    # 'source_pos' is guaranteed to be present at this point.
    if context_obj['source_pos'] != [] and 'alignments' in context_obj:
        aligned = context_obj['alignments'][context_obj['index']]
        if aligned is None:
            source_tag = '__unaligned__'
        else:
            source_tag = context_obj['source_pos'][aligned]

    return [target_tag, source_tag]
def get_features(self, context_obj):
    """Return alignment statistics for the target phrase.

    The phrase covers target positions ``index[0]`` (inclusive) to
    ``index[1]`` (exclusive). ``alignments_all`` is expected to hold, per
    target position, the list of source indices aligned to it. When it is
    missing it is derived from ``alignments`` (one index or ``None`` per
    target position) and stored back into ``context_obj``.

    Returns:
        A list of three strings:

        * number of phrase words with no alignment / ``len(token)``;
        * number of phrase words with more than one alignment /
          ``len(token)``;
        * ``np.average`` of the alignment counts over the phrase.

    Raises:
        NoDataError: if ``source`` or ``target`` is missing or ``None``.
        AssertionError: if ``token`` does not match the target words in
            the phrase span.
        ZeroDivisionError: if ``token`` is empty.
    """
    if 'source' not in context_obj or context_obj['source'] is None:
        raise NoDataError('source', context_obj, 'AlignmentFeatureExtractor')
    if 'target' not in context_obj or context_obj['target'] is None:
        raise NoDataError('target', context_obj, 'AlignmentFeatureExtractor')

    if 'alignments_all' not in context_obj:
        # An unaligned word (None) must become an empty list; wrapping it
        # as [None] would make it count as one alignment.
        context_obj['alignments_all'] = [
            [] if src_idx is None else [src_idx]
            for src_idx in context_obj['alignments']
        ]

    start = context_obj['index'][0]
    end = context_obj['index'][1]

    # Sanity check, done once: the phrase tokens must be the target words
    # covered by the span.
    span_words = [context_obj['target'][j] for j in range(start, end)]
    assert all(
        w == ww for (w, ww) in zip(context_obj['token'], span_words)
    ), "Assertion failed"

    n_alignments = [
        len(context_obj['alignments_all'][i]) for i in range(start, end)
    ]
    n_unaligned = sum(1 for count in n_alignments if count == 0)
    n_multiple = sum(1 for count in n_alignments if count > 1)

    tg_len = len(context_obj['token'])
    return [
        str(n_unaligned / tg_len),
        str(n_multiple / tg_len),
        str(np.average(n_alignments)),
    ]
def get_features(self, context_obj):
    """Return the aligned source word and its left/right source contexts.

    Looks up ``alignments[index]`` to find the source position aligned to
    the current target word.

    Returns:
        ``[aligned_word, left, right]`` where ``left`` / ``right`` are the
        items returned by ``left_context`` / ``right_context`` (requested
        with ``context_size=self.context_size``) joined with ``'|'``.
        For an unaligned word (alignment is ``None``) the word is
        ``'__unaligned__'`` and each context is ``'__unaligned__'``
        repeated ``self.context_size`` times, joined with ``'|'``.

    Raises:
        NoDataError: if ``source`` or ``target`` is missing or ``None``,
            or ``alignments`` is missing.
        SystemExit: with a diagnostic message (non-zero exit status) if
            ``index`` is out of range for ``alignments``.
    """
    if 'source' not in context_obj or context_obj['source'] is None:
        raise NoDataError('source', context_obj, 'AlignmentFeatureExtractor')
    if 'target' not in context_obj or context_obj['target'] is None:
        raise NoDataError('target', context_obj, 'AlignmentFeatureExtractor')
    if 'alignments' not in context_obj:
        raise NoDataError('alignments', context_obj,
                          'AlignmentFeatureExtractor')

    try:
        align_idx = context_obj['alignments'][context_obj['index']]
    except IndexError:
        # Passing a message to sys.exit() writes it to stderr and exits
        # with status 1; a bare sys.exit() would report success.
        sys.exit("{} items in the alignment, needed {}-th\n{} {} {}".format(
            len(context_obj['alignments']), context_obj['index'],
            context_obj['alignments'], context_obj['target'],
            context_obj['source']))

    # Unaligned word: no source word and no source contexts.
    if align_idx is None:
        placeholder = '|'.join(['__unaligned__'] * self.context_size)
        return ['__unaligned__', placeholder, placeholder]

    # TODO: find contexts for all words aligned to the token
    # (currently only one aligned word is considered).
    source = context_obj['source']
    aligned_to = source[align_idx]
    left = '|'.join(
        left_context(source, aligned_to,
                     context_size=self.context_size, idx=align_idx))
    right = '|'.join(
        right_context(source, aligned_to,
                      context_size=self.context_size, idx=align_idx))
    return [aligned_to, left, right]
def get_features(self, context_obj):
    """Tell whether the token occurs in the sentence's pseudo-reference.

    The pseudo-reference is looked up as
    ``self.pseudo_references[context_obj['sentence_id']]``.

    Returns:
        ``[1]`` if ``context_obj['token']`` is contained in that
        pseudo-reference, ``[0]`` otherwise.

    Raises:
        NoDataError: if ``sentence_id`` is missing from ``context_obj``.
    """
    if 'sentence_id' not in context_obj:
        raise NoDataError('sentence_id', context_obj,
                          'PseudoReferenceFeatureExtractor')

    token = context_obj['token']
    reference = self.pseudo_references[context_obj['sentence_id']]
    if token in reference:
        return [1]
    return [0]
def get_features(self, context_obj):
    """Return the sequence tag that precedes the current position.

    Returns:
        ``['_START_']`` when ``context_obj['index']`` is 0, otherwise a
        one-element list with ``sequence_tags[index - 1]``.

    Raises:
        NoDataError: if ``sequence_tags`` is missing from ``context_obj``.
    """
    if 'sequence_tags' not in context_obj:
        raise NoDataError('sequence_tags', context_obj,
                          'PreviousTagFeatureExtractor')

    position = context_obj['index']
    if position != 0:
        return [context_obj['sequence_tags'][position - 1]]
    # Nothing precedes the first element of the sequence.
    return ['_START_']
def get_features(self, context_obj):
    """Return the ``check_lm`` results for the n-grams around the aligned source word.

    The source word is the one at ``alignments[index]``. Two n-grams are
    built from it: the left context (``context_size=2``) followed by the
    word, and the word followed by the right context (``context_size=2``).

    Returns:
        ``[left, right]`` where each value is what ``self.check_lm``
        returns for the corresponding n-gram (called with ``side='left'``
        and ``side='right'``), or ``[0, 0]`` when the target word is
        unaligned (alignment is ``None``).

    Raises:
        NoDataError: if ``source`` or ``alignments`` is missing.
    """
    # Same check order as before: source first, then alignments.
    for required in ('source', 'alignments'):
        if required not in context_obj:
            raise NoDataError(required, context_obj,
                              'SourceLMFeatureExtractor')

    src_position = context_obj['alignments'][context_obj['index']]
    if src_position is None:
        # Unaligned target word: nothing to look up.
        return [0, 0]

    source = context_obj['source']
    src_token = source[src_position]

    preceding = left_context(
        source, src_token, context_size=2, idx=src_position)
    ngram_ending_here = preceding + [src_token]

    following = right_context(
        source, src_token, context_size=2, idx=src_position)
    ngram_starting_here = [src_token] + following

    left_order = self.check_lm(ngram_ending_here, side='left')
    right_order = self.check_lm(ngram_starting_here, side='right')
    return [left_order, right_order]
def get_features(self, context_obj):
    """Tell whether the token occurs in a translation of the source.

    Uses ``context_obj['pseudo-reference']`` as the translation when it is
    present. Otherwise the source words are joined with spaces, translated
    with a fresh ``Goslate`` instance into ``self.lang`` and split with
    ``word_tokenize``.

    Returns:
        ``[1]`` if ``context_obj['token']`` is contained in the
        translation, ``[0]`` otherwise.

    Raises:
        NoDataError: if ``source`` is missing from ``context_obj``.
    """
    if 'source' not in context_obj:
        raise NoDataError('source', context_obj,
                          'GoogleTranslateFeatureExtractor')

    if 'pseudo-reference' not in context_obj:
        # No ready-made translation: translate the source sentence now.
        translator = Goslate()
        source_text = ' '.join(context_obj['source'])
        translation = word_tokenize(
            translator.translate(source_text, self.lang))
    else:
        translation = context_obj['pseudo-reference']

    return [1] if context_obj['token'] in translation else [0]