def __init__(self, model, vocab=None):
    """Keep a reference to ``model`` and build a ``Linear`` stored as ``_shadow``.

    Args:
        model: Object exposing a ``weight`` attribute that supports
            ``len()``; kept as ``self.model``.
        vocab: Optional second argument forwarded to ``Linear``. When it is
            ``None`` the ``Linear`` is built from the weight count alone.
    """
    weight_count = len(model.weight)
    # Identity test on purpose: ``== None`` dispatches to a user-defined
    # ``__eq__`` and is ambiguous for array-like vocabularies.
    if vocab is None:
        self._shadow = Linear(weight_count)
    else:
        self._shadow = Linear(weight_count, vocab)
    self.model = model
    self._counter = 1.0
def __init__(self, vocab_words, vocab_tags, encoded=True):
    """Remember both vocabularies and create the underlying ``Linear`` model.

    Args:
        vocab_words: Word vocabulary, kept as ``self.vocab_words``.
        vocab_tags: Tag vocabulary, kept as ``self.vocab_tags``; its length
            is the first argument given to ``Linear``.
        encoded: If true, ``Linear`` only receives the number of tags;
            otherwise ``vocab_tags`` itself is passed as a second argument.
    """
    self.vocab_words, self.vocab_tags = vocab_words, vocab_tags
    self.model = (
        Linear(len(vocab_tags))
        if encoded
        else Linear(len(vocab_tags), vocab_tags)
    )
def __init__(self, vocab_words, vocab_tags, encoded=True):
    """Store the vocabularies, reset the lookup tables and build the model.

    Args:
        vocab_words: Word vocabulary, kept as ``self.vocab_words``.
        vocab_tags: Tag vocabulary, kept as ``self.vocab_tags``; its length
            is the first argument given to ``Linear``.
        encoded: If true, ``Linear`` only receives the number of tags;
            otherwise ``vocab_tags`` itself is passed as a second argument.
    """
    self.vocab_words, self.vocab_tags = vocab_words, vocab_tags

    # Start with empty tables and no fallback; nothing in this method
    # populates them.
    self.most_frequent = {}
    self.pre_sufix = {}
    self.fallback = None

    self.model = (
        Linear(len(vocab_tags))
        if encoded
        else Linear(len(vocab_tags), vocab_tags)
    )
class PerceptronTagger(Tagger):
    """Tagger that predicts one tag per word, left to right, with ``Linear``."""

    def __init__(self, vocab_words, vocab_tags, encoded=True):
        """Store the vocabularies and create the ``Linear`` model.

        Args:
            vocab_words: Word vocabulary; also used to look up ``PAD``.
            vocab_tags: Tag vocabulary; its length sizes the model.
            encoded: If true, ``Linear`` only receives the number of tags;
                otherwise ``vocab_tags`` is passed as a second argument.
        """
        self.vocab_words, self.vocab_tags = vocab_words, vocab_tags
        self.model = (
            Linear(len(vocab_tags))
            if encoded
            else Linear(len(vocab_tags), vocab_tags)
        )

    def featurize(self, words, i, pred_tags):
        """Build the feature list for the word at position ``i``.

        Args:
            words: Sequence of words of the sentence.
            i: Index into ``words`` of the word being tagged.
            pred_tags: Tags predicted so far for positions ``0 .. i - 1``.

        Returns:
            A list of four ``(feature_index, value)`` pairs: current word,
            previous word, next word and previous predicted tag. Values that
            would fall outside the sentence are replaced by
            ``self.vocab_words[PAD]``.
        """
        pad = self.vocab_words[PAD]
        current = words[i]

        if i == 0:
            # Nothing precedes the first word.
            before = pad
        else:
            before = words[i - 1]

        if i == len(words) - 1:
            # Nothing follows the last word.
            after = pad
        else:
            after = words[i + 1]

        if i == 0:
            before_tag = pad
        else:
            before_tag = pred_tags[i - 1]

        return [(0, current), (1, before), (2, after), (3, before_tag)]

    def predict(self, words):
        """Predict a tag for every word in ``words``.

        Each position is featurized using the tags already predicted, and
        the key of the model output with the largest value is chosen.

        Args:
            words: Sequence of words of the sentence.

        Returns:
            List of predicted tags, one per word.
        """
        pred_tags = []
        for position in range(len(words)):
            scores = self.model.forward(
                self.featurize(words, position, pred_tags)
            )
            best = max(scores, key=scores.get)
            pred_tags.append(best)
        return pred_tags
class GoldTagger(PerceptronTagger):
    """Tagger whose features also use affix lookups and the next word's tag.

    Compared with the four base features, ``featurize`` additionally emits a
    previous-tag bigram, prefix/suffix lookups in ``self.pre_sufix``, the tag
    supplied for the following word, and hyphen/digit indicators.
    """

    def __init__(self, vocab_words, vocab_tags, encoded=True):
        """Store the vocabularies, reset the lookup tables and build the model.

        Args:
            vocab_words: Word vocabulary; also used to look up ``PAD``.
            vocab_tags: Tag vocabulary; its length sizes the model.
            encoded: If true, ``Linear`` only receives the number of tags;
                otherwise ``vocab_tags`` is passed as a second argument.
        """
        self.vocab_words = vocab_words
        self.vocab_tags = vocab_tags
        # Empty tables / no fallback; nothing in this class populates them.
        self.most_frequent = {}
        self.pre_sufix = {}
        self.fallback = None
        if encoded:
            self.model = Linear(len(vocab_tags))
        else:
            self.model = Linear(len(vocab_tags), vocab_tags)

    def featurize(self, words, i, pred_tags, next_tag):
        """Build the feature list for the word at position ``i``.

        Args:
            words: Sequence of words; each word is sliced and regex-searched,
                so it must be a string.
            i: Index into ``words`` of the word being tagged.
            pred_tags: Tags predicted so far for positions ``0 .. i - 1``.
            next_tag: Value emitted unchanged as feature 13.

        Returns:
            A list of sixteen ``(feature_index, value)`` pairs. Missing
            context and failed lookups are replaced by
            ``self.vocab_words[PAD]``.
        """
        pad = self.vocab_words[PAD]
        word = words[i]
        prev_tag = pred_tags[i - 1] if i != 0 else pad
        prev_prev_tag = pred_tags[i - 2] if i > 1 else pad

        features = [
            (0, word),
            # Bug fix: the conditional used to wrap the whole tuple, so at
            # i == 0 a bare pad value was appended instead of ``(1, pad)``.
            # NOTE(review): this changes the feature emitted at i == 0;
            # presumably previously trained weights need retraining - confirm.
            (1, words[i - 1] if i != 0 else pad),
            (2, words[i + 1] if i + 1 < len(words) else pad),
            (3, prev_tag),
            (4, (prev_prev_tag, prev_tag)),
        ]

        # Features 5-8 look up the prefixes of length 1-4, features 9-12 the
        # suffixes of length 1-4, all in the same ``pre_sufix`` table.
        affixes = [word[:n] for n in range(1, 5)]
        affixes += [word[-n:] for n in range(1, 5)]
        for index, affix in enumerate(affixes, start=5):
            features.append((index, self.pre_sufix.get(affix, pad)))

        # Tag supplied by the caller for the following word (the original
        # comment described it as the most frequent tag for the next word).
        features.append((13, next_tag))

        # Indicator features: 1 when the word contains a hyphen / a digit.
        features.append((14, 1 if re.search(r'\w*-\w*', word) else pad))
        features.append((15, 1 if re.search(r'\d', word) else pad))
        return features

    def predict(self, words, tags):
        """Predict a tag for every word, given a tag sequence for lookahead.

        Args:
            words: Sequence of words of the sentence.
            tags: Sequence indexed in parallel with ``words``; ``tags[i + 1]``
                is passed to ``featurize`` as ``next_tag`` for position ``i``.

        Returns:
            List of predicted tags, one per word.
        """
        pred_tags = []
        for i in range(len(words)):
            # NOTE(review): the padding here is the raw ``PAD`` constant,
            # while ``featurize`` pads with ``self.vocab_words[PAD]`` -
            # confirm this difference is intended.
            next_tag = tags[i + 1] if i + 1 < len(tags) else PAD
            features = self.featurize(words, i, pred_tags, next_tag)
            output_vector = self.model.forward(features)
            pred_tags.append(max(output_vector, key=output_vector.get))
        return pred_tags
class PerceptronParser(Parser):
    """Parser that repeatedly asks a ``Linear`` model for the next move."""

    def __init__(self, vocab_words, vocab_tags):
        """Store the vocabularies and create the ``Linear`` model.

        Args:
            vocab_words: Word vocabulary; also used to look up ``PAD``.
            vocab_tags: Tag vocabulary; its length sizes the model.
        """
        self.vocab_words = vocab_words
        self.vocab_tags = vocab_tags
        self.model = Linear(len(vocab_tags))

    def __calc_dist(self, dist):
        """Bucket ``dist`` into 1 (below 6), 2 (6 to 11) or 3 (otherwise).

        Not called anywhere inside this class.
        """
        if dist < 6:
            return 1
        if 6 <= dist <= 11:
            return 2
        return 3

    def featurize(self, words, tags, config):
        """Build the feature list for a parser configuration.

        Args:
            words: Sequence of words of the sentence.
            tags: Sequence of tags, indexed in parallel with ``words``.
            config: Indexable configuration; ``config[0]`` is the position of
                the next word and ``config[1]`` is the stack of word indices.

        Returns:
            A list of six ``(feature_index, value)`` pairs: the next word and
            the top two stack words, followed by their tags. Missing items
            are replaced by ``self.vocab_words[PAD]``.
        """
        pad = self.vocab_words[PAD]
        i = config[0]
        stack = config[1]
        # The heads in config[2] are not used by any feature, so the former
        # unused local has been dropped.

        has_next = i != len(words)
        depth = len(stack)

        w_next = words[i] if has_next else pad
        w_top = words[stack[-1]] if depth != 0 else pad
        w_sec = words[stack[-2]] if depth >= 2 else pad

        t_next = tags[i] if has_next else pad
        t_top = tags[stack[-1]] if depth != 0 else pad
        t_sec = tags[stack[-2]] if depth >= 2 else pad

        return [
            (0, w_next),
            (1, w_top),
            (2, w_sec),
            (3, t_next),
            (4, t_top),
            (5, t_sec),
        ]

    def predict(self, words, tags):
        """Parse a sentence and return the heads of the final configuration.

        Args:
            words: Sequence of words of the sentence.
            tags: Sequence of tags, indexed in parallel with ``words``.

        Returns:
            ``config[2]`` of the configuration reached once no valid moves
            remain.
        """
        # NOTE(review): a fresh ``Parser()`` is created only to obtain the
        # initial configuration, while the other calls go through ``self``;
        # ``self.initial_config`` may be equivalent - confirm against Parser.
        parser = Parser()

        # 1. Start in the initial configuration for the input sentence.
        config = parser.initial_config(len(words))

        # 2. As long as there are valid moves, ask the model for the next
        #    move to take and apply it.
        while len(self.valid_moves(config)) != 0:
            features = self.featurize(words, tags, config)
            output_vector = self.model.forward(features)
            move = max(output_vector, key=output_vector.get)
            config = self.next_config(config, move)

        # 3. Return the list of heads associated with the final configuration.
        return config[2]
def __init__(self, vocab_words, vocab_tags):
    """Remember both vocabularies and create the ``Linear`` model.

    Args:
        vocab_words: Word vocabulary, kept as ``self.vocab_words``.
        vocab_tags: Tag vocabulary, kept as ``self.vocab_tags``; its length
            is the only argument given to ``Linear``.
    """
    self.vocab_words, self.vocab_tags = vocab_words, vocab_tags
    self.model = Linear(len(vocab_tags))