class Dataset(object):
    """Tagged-corpus container holding train/dev/test folds plus the
    index tables (tag set, vocabulary, affix counts) built from train."""

    def __init__(self, train, dev, test):
        self.train = train
        self.dev = dev
        self.test = test
        # Index structures; filled in by `_index` from the training fold.
        self.Y = Alphabet()            # tag set
        self.V = Alphabet()            # vocabulary
        self.V_freq = Counter()        # token unigram counts
        self.V2Y = defaultdict(set)    # tag dictionary: word -> tags seen with it
        self.prefixes = Counter()
        self.suffixes = Counter()
        self._index(self.train)

    def _index(self, data):
        "Populate alphabets and frequency tables from `data`."
        for sent in data:
            for tag, word in sent:
                self.Y.add(tag)
                self.V.add(word)
                self.V2Y[word].add(tag)
                self.V_freq[word] += 1
                # Counter.update with an iterable counts its elements.
                self.prefixes.update(prefixes(word))
                self.suffixes.update(suffixes(word))

    def make_instances(self, fold, cls):
        "Convert tuples in data `fold` to instances of `cls`."
        out = []
        for pairs in iterview(getattr(self, fold), msg='Features (%s)' % fold):
            tags, tokens = zip(*pairs)
            out.append(cls(tokens, self.Y.map(tags), self))
        return out

    def tag_ngram_counts(self, n):
        "Return tag n-gram counts over subsequences of length `n`."

        def tag_sequences():
            "Yield each training sentence's tag sequence as stored."
            for sent in self.train:
                tags, _ = zip(*sent)
                yield tags

        return ngram_counts(tag_sequences(), n)
class Dictionary(object):
    """Bilingual dictionary with per-pair confidence scores.

    Each line of the input file holds `source target score`; both sides
    are interned in their own alphabets and the score is kept in a dict
    keyed by the (source, target) pair.
    """

    def __init__(self, fin, source_lang, target_lang):
        self.source_lang = source_lang
        self.target_lang = target_lang
        # Interned vocabularies for each side plus the score table.
        self.source = Alphabet()
        self.target = Alphabet()
        self.store = {}
        with codecs.open(fin, encoding="utf-8") as f:
            for raw in f:
                src, tgt, score = raw.strip().split(" ")
                self.store[(src, tgt)] = float(score)
                self.source.add(src)
                self.target.add(tgt)
def vectorize(self, segmentations, maxn=200, threshold=5):
    """Build feature vectors (for theano) from segment inventories.

    Every distinct segment gets an integer id; segments occurring more
    than `threshold` times additionally get a one-hot slot.  Each whole
    segmentation is mapped (keyed by its space-joined string) to the sum
    of its segments' vectors in `self.segmentation2vec`.
    """
    seg_ids = {'11': 0, '22': 1}   # reserved ids, never vectorized
    frequent = Alphabet()          # ids (offset by 2) for frequent segments
    freq = dd(int)
    for seg_seq in segmentations:
        for seg in seg_seq:
            if seg not in seg_ids:
                seg_ids[seg] = len(seg_ids)
            freq[seg] += 1
    # Vector width: grows by one per frequent segment on top of an
    # initial size of `threshold` (NOTE(review): one might expect this
    # to start at 2, matching the two indicator slots below — the extra
    # slack appears intentional or at least relied upon downstream;
    # confirm before changing).
    self.N = threshold
    for seg, n in freq.items():
        if n > threshold:
            frequent.add(seg)
            self.N += 1
    seg2vec = {}
    for seg, sid in seg_ids.items():
        if sid < 2:
            continue  # skip the reserved entries
        vec = zeros((self.N))
        # Slot 0: dictionary word (as-is or title-cased); slot 1: other
        # long segment; both only for segments longer than 3 chars.
        in_dict = self.d.check(seg) or self.d.check(seg.title())
        if in_dict and len(seg) > 3:
            vec[0] = 1.0
        elif len(seg) > 3:
            vec[1] = 1.0
        if freq[seg] > threshold:
            vec[frequent[seg] + 2] = 1.0
        seg2vec[seg] = vec
    # One vector per segmentation: the sum of its segments' vectors.
    self.segmentation2vec = {}
    for seg_seq in segmentations:
        total = zeros((self.N))
        for seg in seg_seq:
            total += seg2vec[seg]
        self.segmentation2vec[' '.join(seg_seq)] = total
class StringFeatures(object):
    """Prefix/suffix string features over an interned attribute alphabet."""

    def __init__(self, prefix_length=0, suffix_length=4):
        self.prefix_length, self.suffix_length = prefix_length, suffix_length
        self.attributes = Alphabet()     # feature name -> feature id
        self.word2attributes = {}        # word id -> list of feature ids
        self.words = Alphabet()          # word -> word id

    def get_attributes(self, word, extract=False):
        """Return the list of feature ids firing on `word`.

        When `extract` is true, unseen feature names are first added to
        the attribute alphabet; otherwise only already-known features
        fire (read-only lookup).
        """
        lst = []
        for i in xrange(1, self.prefix_length+1):
            if i > len(word):
                break
            prefix = word[:i]
            name = "PREFIX: "+prefix
            if extract:
                self.attributes.add(name)
            if name in self.attributes:
                lst.append(self.attributes[name])
        for i in xrange(1, self.suffix_length+1):
            # BUG FIX: was `if i < 0: break`, which can never fire since
            # i starts at 1.  Without the length guard, words shorter
            # than suffix_length fired the same whole-word suffix
            # repeatedly (w[-i:] == w for every i >= len(w)).  Mirrors
            # the prefix loop's guard above.
            if i > len(word):
                break
            suffix = word[-i:]
            name = "SUFFIX: "+suffix
            if extract:
                self.attributes.add(name)
            if name in self.attributes:
                lst.append(self.attributes[name])
        return lst

    def store(self, word):
        """Extract (growing the alphabet) and memoize `word`'s features."""
        self.words.add(word)
        i = self.words[word]
        self.word2attributes[i] = self.get_attributes(word, True)

    def __len__(self):
        return len(self.attributes)

    def __getitem__(self, word):
        """Feature ids for `word`: memoized if stored, else computed
        without extending the attribute alphabet."""
        if word in self.words:
            i = self.words[word]
            return self.word2attributes[i]
        # don't extract
        return self.get_attributes(word, False)
class Segmenter(object):
    """Joint transducer + segmenter model.

    Wires together a string transducer (surface form -> underlying form)
    and a segmenter (tree- or chunk-based) over the UR, with pipelined or
    importance-sampled joint training and several UR-selection decoders.
    """

    def __init__(self, train, dev, test, decode_type, split_num,
                 log_fname=None, segmenter_type='tree', G=3,
                 weights='weights', alphabet=None,
                 T_L=2, T_eta=1.0, T_C=0.0000001,
                 S_L=2, S_eta=1.0, S_C=0.00000001):
        # set up the logging system (console + rotating file).
        # NOTE(review): `format` shadows the builtin; harmless here.
        log = logging.getLogger('')
        log.setLevel(logging.INFO)
        format = logging.Formatter("%(asctime)s<>%(levelname)s<>%(message)s")
        ch = logging.StreamHandler(sys.stdout)
        ch.setFormatter(format)
        log.addHandler(ch)
        fh = logging.handlers.RotatingFileHandler(log_fname,
                                                  maxBytes=(1048576 * 5),
                                                  backupCount=7)
        fh.setFormatter(format)
        log.addHandler(fh)
        self.G = G
        self.split_num = split_num
        self.segmenter_type = segmenter_type
        self.decode_type = decode_type
        # T_* hyperparameters govern the transducer, S_* the segmenter.
        self.S_L, self.S_eta, self.S_C = S_L, S_eta, S_C
        self.T_L, self.T_eta, self.T_C = T_L, T_eta, T_C
        # Signature string used in weight-file names.
        self.sig = "split={0},L={1},C={2},eta={3},type={4}".format(
            *(self.split_num, self.T_L, self.T_C, self.T_eta,
              self.decode_type))
        logging.info("Transducer Regularizer Type: L={0}".format(T_L))
        logging.info("Transducer Regularizer Coefficient: C={0}".format(T_C))
        logging.info("Transducer Learning Rate: eta={0}".format(T_eta))
        logging.info("Segmenter Regularizer Type: L={0}".format(S_L))
        logging.info("Segmenter Regularizer Coefficient: C={0}".format(S_C))
        logging.info("Segmenter Learning Rate: eta={0}".format(S_eta))
        self.weights = weights
        # Seed the alphabet: epsilon must be id 0; a few common letters
        # are pre-added (presumably to fix their ids — confirm).
        self.Sigma = Alphabet()
        self.Sigma.add("")  # add epsilon at 0
        self.Sigma.add("o")
        self.Sigma.add("n")
        self.Sigma.add("y")
        self.Sigma.add("s")
        self.Sigma.add("e")
        if alphabet is not None:
            # Replace the seeded alphabet with one loaded from disk.
            self.Sigma = self.Sigma.load(alphabet)
        # process the data (caps: 100000 train, 1000 dev/test instances)
        # TODO: modularize
        self.train = self.process(train, 100000)
        self.dev = self.process(dev, 1000)
        self.test = self.process(test, 1000)
        # dump the alphabet so later runs can reload identical ids
        self.Sigma.save("alphabets/sigma-{0}.alphabet".format(self.split_num))
        # create model
        self.segmenter = None
        logging.info("Segmenter Type: {0}".format(self.segmenter_type))
        logging.info("Decoder Type: {0}".format(self.decode_type))
        if self.segmenter_type == TREE:
            self.segmenter = TreeSegmenter(G)
            self.features = TreeFeatures(self.G, 1000)
        elif self.segmenter_type == CHUNK:
            self.segmenter = ChunkSegmenter(G)
            self.features = ChunkFeatures(self.G, 1000)
        else:
            raise Exception('Illicit Model Type')
        # transducer (surface -> underlying form model)
        self.transducer = TransducerModel(self.train, self.dev, self.test,
                                          self.Sigma, L=self.T_L,
                                          eta=self.T_eta, C=self.T_C)
        # extract features for all folds
        self.features.featurize(self.train, 'train')
        self.features.featurize(self.dev, 'dev')
        self.features.featurize(self.test, 'test')
        # dimension of the (hashed) segmenter weight vector
        self.d = 2**22 + self.features.offset
        self.updater = LazyRegularizedAdagrad(self.d, L=2, C=self.S_C,
                                              eta=0.1, fudge=1e-4)
        # NOTE(review): slots 0/1 are warm-started to 10 — presumably
        # bias features; confirm against the feature layout.
        self.updater.w[0] = 10
        self.updater.w[1] = 10

    def save_transducer(self, directory, i):
        """Save the transducer weight vector for epoch/tag `i`."""
        np.save(directory + "/transducer-{0}-{1}.npy".format(*(self.sig, i)),
                array(self.transducer.updater.w))

    def save_segmenter(self, directory, i):
        """Save the segmenter weight vector for epoch/tag `i`."""
        np.save(directory + "/segmenter-{0}-{1}.npy".format(*(self.sig, i)),
                array(self.updater.w))

    def save(self, directory):
        """Save both weight vectors under the 'final' tag."""
        self.save_transducer(directory, 'final')
        self.save_segmenter(directory, 'final')

    def optimize(self, t=None, load=False, transducer=None, segmenter=None,
                 iterations=20):
        """Optionally load weights, then train jointly or as a pipeline.

        `t` selects the regime (JOINT / PIPE / None = load only).
        """
        if load:
            assert transducer is not None
            #assert segmenter is not None
            self.load_weights(transducer, segmenter)
        if t is None:
            return
        elif t == JOINT:
            self.optimize_joint(iterations)
        elif t == PIPE:
            self.optimize_pipeline(iterations)

    def load_weights(self, transducer, segmenter):
        """Load both weight vectors from .npy files."""
        self.transducer.updater.w = np.load(transducer)
        self.updater.w = np.load(segmenter)

    def optimize_pipeline(self, iterations=10):
        """Pipeline training: fully train the transducer, then the segmenter."""
        for i in xrange(iterations):
            self.transducer.optimize(1, i)
            train_acc = self.transducer.evaluate(self.train)
            dev_acc = self.transducer.evaluate(self.dev)
            test_acc = self.transducer.evaluate(self.test)
            logging.info(
                "transducer epoch {0} train acc: {1}".format(*(i, train_acc)))
            logging.info(
                "transducer epoch {0} dev acc: {1}".format(*(i, dev_acc)))
            logging.info(
                "transducer epoch {0} test acc: {1}".format(*(i, test_acc)))
            self.save_transducer(self.weights, i)
            print self.transducer.evaluate(self.dev)
        if self.segmenter_type == TREE:
            self.optimize_tree(iterations)
        elif self.segmenter_type == CHUNK:
            self.optimize_chunk(iterations)

    def optimize_chunk(self, iterations):
        """Train the chunk segmenter by maximizing the log-likelihood."""
        for i in xrange(iterations):
            for tree in iterview(self.train,
                                 colored('Pass %s' % (i + 1), 'blue')):
                gt0, gt, ge = self.features.potentials_catchup(
                    tree, self.updater.w, self.updater)
                dgt0, dgt, dge = zeros_like(gt0), zeros_like(gt), zeros_like(
                    ge)
                # dll fills the gradient buffers in place.
                self.segmenter.dll(tree, ge, gt0, gt, dge, dgt0, dgt)
                self.features.update(tree, dge, dgt0, dgt, self.updater)
                self.updater.step += 1
            self.save_segmenter(self.weights, i)
            # NOTE(review): the first two decode results are discarded and
            # `train_acc`/`dev_acc`/`train_f1`/`dev_f1` are never bound, so
            # the logging below raises NameError; additionally `decode`
            # (see below) has no return statement, so the unpacking of
            # `test_acc, test_f1` fails too.  The intent is clearly
            # `train_acc, train_f1 = self.decode(...)` etc. with `decode`
            # returning (acc, f1); also note the last line logs dev_f1
            # under the "test f1" label.
            self.decode(self.train, VITERBI)
            self.decode(self.dev, VITERBI)
            test_acc, test_f1 = self.decode(self.test, VITERBI)
            logging.info("chunk epoch {0} train acc: {1}".format(*(i, train_acc)))
            logging.info("chunk epoch {0} dev acc: {1}".format(*(i, dev_acc)))
            logging.info("chunk epoch {0} test acc: {1}".format(*(i, test_acc)))
            logging.info("chunk epoch {0} train f1: {1}".format(*(i, train_f1)))
            logging.info("chunk epoch {0} dev f1: {1}".format(*(i, dev_f1)))
            logging.info("chunk epoch {0} test f1: {1}".format(*(i, dev_f1)))

    def optimize_tree(self, iterations):
        """Train the tree segmenter by maximizing the log-likelihood."""
        for i in xrange(iterations):
            for tree in iterview(self.train,
                                 colored('Pass %s' % (i + 1), 'blue')):
                psi = self.features.potentials_catchup(tree, self.updater.w,
                                                       self.updater)
                dpsi = self.segmenter.dll(tree, psi)
                self.features.update(tree, dpsi, self.updater)
                self.updater.step += 1
            self.save_segmenter(self.weights, i)
            # NOTE(review): same defect as optimize_chunk — the *_acc/*_f1
            # names logged below are never assigned (NameError) and
            # `decode` returns None, so the unpacking fails.
            self.decode(self.train, VITERBI)
            self.decode(self.dev, VITERBI)
            test_acc, test_f1 = self.decode(self.test, VITERBI)
            logging.info("tree epoch {0} train acc: {1}".format(*(i, train_acc)))
            logging.info("tree epoch {0} dev acc: {1}".format(*(i, dev_acc)))
            logging.info("tree epoch {0} test acc: {1}".format(*(i, test_acc)))
            logging.info("tree epoch {0} train f1: {1}".format(*(i, train_f1)))
            logging.info("tree epoch {0} dev f1: {1}".format(*(i, dev_f1)))
            logging.info("tree epoch {0} test f1: {1}".format(*(i, test_f1)))

    def optimize_joint(self, iterations, num_samples=10, eta1=0.0, eta2=0.0):
        """Joint training of transducer and segmenter via importance sampling.

        For each instance, URs sampled from the transducer are re-weighted
        by the (normalized) joint score, then both models take a gradient
        step weighted by those importance weights.
        """
        # TODO: unit test
        self.updater.eta = eta1
        for i in xrange(iterations):
            samples = self.transducer.sample(self.train, num=num_samples)
            for tree, sample in iterview(zip(self.train, samples),
                                         colored('Pass %s' % (i + 1),
                                                 'blue')):
                # compute the approximate partition function over samples
                logZ = NINF
                strings, weights = [], []
                for (ur, count) in sample.items():
                    score = self.transducer.ll(tree, ur)
                    if self.segmenter_type == CHUNK:
                        score += self.score_chunk(tree, ur)
                    elif self.segmenter_type == TREE:
                        score += self.score_tree(tree, ur)
                    # TODO: double check
                    logZ = logaddexp(logZ, score)
                    weights.append(score)
                    strings.append(ur)
                # normalize in log space: weight -> weight - logZ
                # TODO: make more elegant / double check
                tmp = []
                for weight in weights:
                    tmp.append(weight - logZ)
                weights = tmp
                # transducer gradient step with the importance weights
                self.transducer.step_is(tree, strings, weights, eta=eta2)
                # segmenter gradient step with the importance weights
                for ur, weight in zip(sample, weights):
                    if self.segmenter_type == CHUNK:
                        self.is_chunk(tree, ur, weight)
                    elif self.segmenter_type == TREE:
                        self.is_tree(tree, ur, weight)
                self.updater.step += 1

    def is_chunk(self, tree, ur, weight):
        """Importance-weighted gradient step for the chunk segmenter."""
        tree.update_ur(ur)
        self.features.featurize_instance(tree)
        gt0, gt, ge = self.features.potentials_catchup(tree, self.updater.w,
                                                       self.updater)
        dgt0, dgt, dge = zeros_like(gt0), zeros_like(gt), zeros_like(ge)
        self.segmenter.dll(tree, ge, gt0, gt, dge, dgt0, dgt)
        # scale the gradient by the importance weight
        dgt0 *= weight
        dgt *= weight
        dge *= weight
        self.features.update(tree, dge, dgt0, dgt, self.updater)

    def is_tree(self, tree, ur, weight):
        """Importance-weighted gradient step for the tree segmenter."""
        tree.update_ur(ur)
        self.features.featurize_instance(tree)
        psi = self.features.potentials_catchup(tree, self.updater.w,
                                               self.updater)
        dpsi = self.segmenter.dll(tree, psi)
        dpsi *= weight
        self.features.update(tree, dpsi, self.updater)

    def baseline_ur(self, data):
        """Baseline UR: use the surface form itself as the only candidate."""
        for tree in iterview(data, colored('Updating Baseline UR', 'red')):
            tree.ur_samples = []
            tree.ur_samples.append(tree.sr)

    def decode_ur(self, data):
        """Viterbi UR: use the transducer's 1-best decode as the candidate."""
        for tree in iterview(data, colored('Updating Viterbi UR', 'red')):
            tree.ur_samples = []
            viterbi_ur = self.transducer.decode(tree)[1]
            tree.ur_samples.append(viterbi_ur)

    def oracle_ur(self, data):
        """Oracle UR: use the gold underlying form as the only candidate."""
        for tree in iterview(data, colored('Updating Oracle UR', 'red')):
            tree.ur_samples = []
            tree.ur_samples.append(tree.ur_gold)

    def sample_ur(self, data, num_samples=1000):
        """Sampled URs: Viterbi decode plus transducer samples as candidates."""
        samples = self.transducer.sample(data, num=num_samples)
        # NOTE(review): the loop variable `samples` shadows the outer list;
        # it works because zip() is evaluated first, but rename for clarity.
        for tree, samples in iterview(zip(data, samples),
                                      colored('Sampling', 'red')):
            tree.ur_samples = []
            viterbi_ur = self.transducer.decode(tree)[1]
            tree.ur_samples.append(viterbi_ur)
            for sample, count in samples.items():
                tree.ur_samples.append(sample)

    def decode_chunk(self, tree, ur):
        """Decode a chunking of `ur`; returns (gold segments, guessed segments)."""
        tree.update_ur(ur)
        self.features.featurize_instance(tree)
        gt0, gt, ge = self.features.potentials_catchup(tree, self.updater.w,
                                                       self.updater)
        best, segments, labels = self.segmenter.decode(tree, ge, gt0, gt)
        truth = [tree.ur[i:j] for i, j in tree.indices]
        guess = [tree.ur[i:j] for i, j in segments]
        return truth, guess

    def decode_tree(self, tree, ur):
        """Decode a tree over `ur`; returns (gold segs, guessed segs, span F1)."""
        tree.update_ur(ur)
        self.features.featurize_instance(tree)
        psi = self.features.potentials_catchup(tree, self.updater.w,
                                               self.updater)
        max_score, tree_string, max_spans = self.segmenter.argmax(
            tree.M, self.G, psi, tree.ur)
        gold_spans = set(tree.spans)
        # NOTE(review): guess_spans is built from tree.spans, identical to
        # gold_spans — so the span F1 below is always 1.0.  This almost
        # certainly should be set(max_spans); confirm and fix.
        guess_spans = set(tree.spans)
        p, r = 0.0, 0.0
        for span in gold_spans:
            if span in guess_spans:
                p += 1.0
        p /= len(gold_spans)
        for span in guess_spans:
            if span in gold_spans:
                r += 1.0
        r /= len(guess_spans)
        f1 = (2 * p * r) / (p + r)
        # Recover the segmentation from the bracketed tree string.
        # TODO: horrible hack
        segmentation = tree_string.replace("(", "").replace(")",
                                                            "").replace(" ",
                                                                        "")
        for i in xrange(100):
            segmentation = segmentation.replace(str(i), "")
        segmentation = segmentation.split(":")
        guess = segmentation[:-1]
        truth = [x[0] for x in to_segmentation(tree.root)]
        return truth, guess, f1

    def score_chunk(self, tree, ur):
        """Log partition function of the chunk model for (tree, ur)."""
        tree.update_ur(ur)
        self.features.featurize_instance(tree)
        M = tree.M
        gt0, gt, ge = self.features.potentials_catchup(tree, self.updater.w,
                                                       self.updater)
        return self.segmenter.logZ(ge, gt0, gt, M)

    def score_tree(self, tree, ur):
        """Log partition function of the tree model for (tree, ur)."""
        tree.update_ur(ur)
        self.features.featurize_instance(tree)
        M = tree.M
        psi = self.features.potentials_catchup(tree, self.updater.w,
                                               self.updater)
        return self.segmenter.logZ(M, self.G, psi)

    def decode(self, data, data_type, decode_type=None, sep=u"#"):
        """Pick the best UR per instance and score the resulting segmentation.

        Logs UR accuracy, segmentation accuracy, F1 and edit distance.
        NOTE(review): has no return statement, yet optimize_chunk /
        optimize_tree unpack its result as (acc, f1) — it should end with
        something like `return correct / total, f1 / total`.
        """
        if decode_type is None:
            decode_type = self.decode_type
        # Populate tree.ur_samples according to the decode regime.
        if decode_type == ORACLE:
            self.oracle_ur(data)
        elif decode_type == BASELINE:
            self.baseline_ur(data)
        elif decode_type == VITERBI:
            self.decode_ur(data)
        elif decode_type == SAMPLE:
            self.sample_ur(data)
        else:
            raise Exception('Illicit Decode Type')
        ur_correct, ur_total = 0, 0
        correct, f1, tree_f1, lev, total = 0, 0, 0, 0, 0
        for tree in iterview(data, colored('Decoding', 'red')):
            # Select the candidate UR with the best joint score.
            max_ur, max_score = None, NINF
            counter = 0
            for ur in tree.ur_samples:
                tree.update_ur(ur)
                counter += 1
                score = 0.0
                score = self.transducer.ll(tree, ur)
                if self.segmenter_type == CHUNK:
                    score += self.score_chunk(tree, ur)
                elif self.segmenter_type == TREE:
                    score += self.score_tree(tree, ur)
                # take the best importance sample
                if score >= max_score:
                    max_score = score
                    max_ur = ur
            if max_ur == tree.ur_gold:
                ur_correct += 1
            ur_total += 1
            truth, guess, tree_f1_tmp = None, None, None
            if self.segmenter_type == CHUNK:
                truth, guess = self.decode_chunk(tree, max_ur)
            elif self.segmenter_type == TREE:
                truth, guess, tree_f1_tmp = self.decode_tree(tree, max_ur)
                tree_f1 += tree_f1_tmp
            # ACCURACY (exact segmentation match)
            if truth == guess:
                correct += 1
            # LEVENSHTEIN over sep-joined segmentations
            lev += Levenshtein.distance(sep.join(truth), sep.join(guess))
            # F1 over segment sets
            # NOTE(review): p and r are ints here; under Python 2 without
            # `from __future__ import division`, `p /= len(set1)` truncates
            # to 0 or 1 — as do the `/ total` averages logged below.
            # Confirm the module imports true division.
            set1, set2 = set(guess), set(truth)
            p, r = 0, 0
            for e in set1:
                if e in set2:
                    p += 1
            for e in set2:
                if e in set1:
                    r += 1
            p /= len(set1)
            r /= len(set2)
            if p + r > 0:
                f1 += 2 * p * r / (p + r)
            total += 1
        logging.info("decoder type: {0}".format(decode_type))
        logging.info("{0} ur acc: {1}".format(*(data_type, ur_correct / total)))
        logging.info("{0} seg acc: {1}".format(*(data_type, correct / total)))
        logging.info("{0} f1: {1}".format(*(data_type, f1 / total)))
        logging.info("{0} edit: {1}".format(*(data_type, lev / total)))
        if self.segmenter_type == TREE:
            logging.info("{0} tree f1: {1}".format(*(data_type,
                                                     tree_f1 / total)))

    def process(self, fname, maximum=100):
        """Load up to `maximum` instances from `fname` into Tree objects,
        growing self.Sigma with every character seen in SR and UR."""
        processed = []
        data = Data(fname)
        for counter, (sr, (tree, (indices, index_labels))) in enumerate(data):
            if counter == maximum:
                break
            ur = to_string(tree)
            for s in list(sr):
                self.Sigma.add(s)
            for s in list(ur):
                self.Sigma.add(s)
            spans, labels = [], []
            for node in walk(tree):
                spans.append((node.i, node.j))
                labels.append(node.label)
            t = Tree(self.G, sr, ur, spans, labels, indices, index_labels,
                     len(spans), tree)
            processed.append(t)
        return processed
class PTB(object):
    "Load the POS-tagged Penn Treebank."

    def __init__(self, base, coarse=True):
        # `base` is the root directory of the PTB POS sections (00..24);
        # `coarse` collapses the tag set (first letter + PUNC).
        self.base = base
        self.coarse = coarse
        self.Y = Alphabet()                       # tag set
        self.V, self.V_freq = Alphabet(), {}      # vocabulary + unigram counts
        self.V2Y, self.Y2V = dd(set), dd(set)     # word<->tag co-occurrence
        self.train, self.dev, self.test = [], [], []
        self.prefixes, self.suffixes = {}, {}     # affix counts (train only)
        self.prefix2int, self.suffix2int = {}, {}  # affix id tables
        # Read data and create standard splits according to
        # http://aclweb.org/aclwiki/index.php?title=POS_Tagging_(State_of_the_art)
        #
        # train split [0,18] — also builds all the index tables
        for sectionid in range(19):
            read = self.read_section(sectionid)
            for sentence in read:
                self.train.append(sentence)
                for y, w in sentence:
                    self.V.add(w)
                    # y is already an integer tag id; lookup maps it back
                    # to the tag string for the co-occurrence tables.
                    self.V2Y[w].add(self.Y.lookup(y))
                    self.Y2V[self.Y.lookup(y)].add(w)
                    if w not in self.V_freq:
                        self.V_freq[w] = 0
                    self.V_freq[w] += 1
                    for prefix in self.extract_prefixes(w):
                        if prefix not in self.prefixes:
                            self.prefixes[prefix] = 0
                        self.prefixes[prefix] += 1
                    for suffix in self.extract_suffixes(w):
                        if suffix not in self.suffixes:
                            self.suffixes[suffix] = 0
                        self.suffixes[suffix] += 1
        # dev split [19,21]
        for sectionid in range(19, 22):
            read = self.read_section(sectionid)
            for sentence in read:
                self.dev.append(sentence)
        # test split [22,24]
        for sectionid in range(22, 25):
            self.test.extend(self.read_section(sectionid))
        # No new tags after loading.
        self.Y.freeze()

    def extract_prefixes(self, w, n=10):
        """Return prefixes of `w` up to length `n`, assigning each unseen
        prefix a fresh id in self.prefix2int as a side effect."""
        prefixes = []
        for i in range(1, min(len(w)+1, n+1)):
            segment = w[:i]
            if segment not in self.prefix2int:
                self.prefix2int[segment] = len(self.prefix2int)
            prefixes.append(w[:i])
        return prefixes

    def extract_suffixes(self, w, n=10):
        """Return suffixes of `w` up to length `n`, assigning each unseen
        suffix a fresh id in self.suffix2int as a side effect."""
        suffixes = []
        for i in range(1, min(len(w)+1, n+1)):
            segment = w[-i:]
            if segment not in self.suffix2int:
                self.suffix2int[segment] = len(self.suffix2int)
            suffixes.append(w[-i:])
        return suffixes

    def tag_bigrams(self):
        """Count all adjacent tag bigrams in the training split."""
        bigram2count = {}
        for sentence in self.train:
            for (tag1, _), (tag2, _) in zip(sentence, sentence[1:]):
                key = (tag1, tag2)
                if key not in bigram2count:
                    bigram2count[key] = 0
                bigram2count[key] += 1
        return bigram2count

    def read_section(self, sectionid):
        "Read a section number `sectionid` from the PTB."
        root = os.path.join(self.base, str(sectionid).zfill(2))
        for fname in os.listdir(root):
            if not fname.endswith('pos.gz'):
                continue
            with gzip.open(os.path.join(root, fname), 'rb') as f:
                # Files contain multiple chunks separated by a ruler line.
                for chunk in f.read().split('======================================'):
                    if chunk.strip():
                        if self.coarse:
                            # Coarse tags: PUNC for punctuation, else the
                            # first letter of the full tag.
                            # Note: clean up punc reduction
                            yield [(self.Y["PUNC"] if y in PUNC
                                    else self.Y[y[0]], w)
                                   for w, y in re_tagged.findall(chunk)]
                        else:
                            # Full tags; ambiguous tags like "NN|JJ" keep
                            # only the part before the bar.
                            # TODO: what to do about bars in the tags?
                            # FIND OUT AND CLEAN UP
                            yield [(self.Y["PUNC"] if y in PUNC
                                    else self.Y[y.split("|")[0]] if "|" in y
                                    else self.Y[y], w)
                                   for w, y in re_tagged.findall(chunk)]

    def pp(self, sentence):
        "Pretty print."
        return ' '.join('%s/%s' % (w, self.Y.lookup(t))
                        for (t, w) in sentence)
class Lexicon(object): """ Reads in the universal morpholigcal lexicon """ def __init__(self, fin, atts, vals, av, avs): # probably redundant... # but not optimizing for space so who cares self.atts, self.vals, self.av, self.avs = atts, vals, av, avs self.lexicon = dd(list) self.words = Alphabet() with codecs.open(fin, encoding="utf-8") as f: for line in f: line = line.strip() if line == "": continue word, lemma, tags = line.split(" ") self.words.add(word) tags = tags.split(",") for tag in tags: if len(tag.split("=")) != 2: print line print tag a, v = tag.split("=") self.av[a].add(v) self.atts.add(a) self.vals.add(v) self.lexicon[word].append((lemma, tags)) # get rid of default dict wrapper self.lexicon = dict(self.lexicon) self.av = dict(self.av) for a, s in self.av.items(): for v in s: self.avs.add((a, v)) def create_vectors(self): self.N = len(self.avs) self.W = zeros((len(self.lexicon), self.N)) # use Manaal's encoding (http://arxiv.org/abs/1512.05030) for w, lst in self.lexicon.items(): vec = zeros((self.N)) for l, ts in lst: for tag in ts: a, v = tag.split("=") #if a != "pos": # continue j = self.avs[(a, v)] vec[j] = 1.0 i = self.words[w] self.W[i] = vec def pp(self, word): """ pretty print the morphological tag of a word """ i = self.words[word] lst = [] for n in xrange(self.N): if self.W[i, n] > 0: lst.append("=".join(self.avs.lookup(n))) return word, ",".join(lst) def __getitem__(self, word): i = self.words[word] return self.W[i]
class TransducerModel(object):
    """String transducer (surface form -> underlying form) trained by SGD.

    Builds input/output/context alphabets from Sigma (iteration order
    fixes the integer ids, so Sigma must be identical across runs) and
    wraps a Transducer with hashed features and an AdaGrad updater.
    """

    def __init__(self, train, dev, test, Sigma, IL=6, L=2, eta=0.01,
                 C=0.0001):
        self.train = train
        self.dev = dev
        self.test = test
        self.Sigma = Sigma
        # Epsilon must be interned at id 0.
        assert self.Sigma[""] == 0
        self.IL = IL        # insertion limit (presumably — confirm in Transducer)
        self.C = C          # regularization coefficient
        self.L = L          # regularizer type (e.g. L2)
        self.eta = eta      # learning rate
        # X and Y: input/output alphabets, epsilon first, then every
        # non-epsilon symbol of Sigma in interning order.
        self.X, self.Y = Alphabet(), Alphabet()
        self.X.add(""); self.Y.add("")
        for s, si in self.Sigma.items():
            if si == 0:
                continue
            self.X.add(s)
        for s, si in self.Sigma.items():
            if si == 0:
                continue
            self.Y.add(s)
        self.X.freeze(); self.Y.freeze()
        # P: first-order context alphabet (possibly extend); the doubled
        # letters mirror the symbols seeded into Sigma upstream.
        self.P = Alphabet()
        self.P.add("")
        for s, si in self.Sigma.items():
            if si == 0:
                continue
            self.P.add(s)
        self.P.add("oo")
        self.P.add("nn")
        self.P.add("yy")
        self.P.add("ss")
        self.P.add("ee")
        self.P.freeze()
        # Z: all context+output concatenations, epsilon at id 0.
        self.Z = Alphabet()
        self.Z[""] = 0
        for p, pi in self.P.items():
            for o, oi in self.Y.items():
                z = p+o
                self.Z.add(z)
        self.Z.freeze()
        # model + features over all folds
        self.model = Transducer(self.Sigma, self.X, self.Y, self.P, self.Z,
                                IL = self.IL)
        self.features = TransducerFeatures(self.X, self.Y, self.P)
        self.features.featurize(self.train, 'train')
        self.features.featurize(self.dev, 'dev')
        self.features.featurize(self.test, 'test')
        # dimension of the hashed weight vector
        self.d = 2**22 + self.features.offset
        self.updater = LazyRegularizedAdagrad(self.d, L=self.L, C=self.C,
                                              eta=self.eta, fudge=1e-4)
        # NOTE(review): slots 0/1 warm-started to +10/-10 — presumably
        # bias features; confirm against TransducerFeatures' layout.
        self.updater.w[0] = 10.0
        self.updater.w[1] = -10.0

    def optimize(self, iterations=10, start=0):
        """SGD over the training fold; `start` only offsets the pass label."""
        #np.random.shuffle(self.train)
        for i in xrange(iterations):
            for instance in iterview(self.train,
                                     colored('Pass %s' % (i+1+start),
                                             'blue')):
                psi = self.features.potentials_catchup(instance,
                                                       self.updater.w,
                                                       self.updater)
                dpsi = zeros_like(psi)
                x, y = instance.sr, instance.ur
                # gradient of the log-likelihood w.r.t. the potentials
                dpsi = self.model.dll(x, y, psi, minx=MINX, miny=MINY)
                self.features.update(instance, dpsi, self.updater)
                self.updater.step += 1

    def step_is(self, tree, strings, weights, eta=0.0):
        """One importance-sampling gradient step for a single instance."""
        self.updater.eta = eta
        psi = self.features.potentials_catchup(tree, self.updater.w,
                                               self.updater)
        dpsi = zeros_like(psi)
        dpsi = self.model.dll_is(tree.sr, tree.ur, strings, weights, psi,
                                 minx=MINX, miny=MINY)
        self.features.update(tree, dpsi, self.updater)
        self.updater.step += 1

    def sample(self, data, num=1000):
        """Sample `num` output strings per instance.

        Returns one dict per instance mapping sampled UR -> count.  The
        inside/correct counters are diagnostics only (currently unused).
        """
        samples = []
        inside = 0
        correct1, correct2, total = 0, 0, 0
        for instance in iterview(data, colored('Sampling', 'green')):
            psi = self.features.potentials_catchup(instance, self.updater.w,
                                                   self.updater)
            sr = instance.sr
            dist = {}
            for s in self.model.sample(sr, psi, num=num):
                # s is a sequence of (input, output) pairs; keep outputs.
                output = ""
                for x, y in s:
                    output += y
                if output not in dist:
                    dist[output] = 0
                dist[output] += 1
            # diagnostics: does the sample set contain the gold UR, and
            # does it disagree with the 1-best decode?
            count = dist[instance.ur_gold] if instance.ur_gold in dist else 0
            decoded = self.decode(instance)[1]
            if decoded != instance.ur_gold and count > 0:
                inside += 1
            if decoded == instance.ur_gold:
                correct1 += 1
            if instance.ur_gold in dist:
                correct2 += 1
            total += 1
            samples.append(dist)
        # TODO: put into log
        return samples

    def decode(self, instance):
        """1-best decode of the instance's surface form; returns the raw
        (score, string, ...) result of Transducer.decode."""
        psi = self.features.potentials_catchup(instance, self.updater.w,
                                               self.updater)
        ur1 = instance.ur
        results = self.model.decode(instance.sr, psi, minx=MINX, miny=MINY)
        return results

    def evaluate(self, data, maximum=100000000):
        """Exact-match accuracy of the 1-best decode against instance.ur
        over at most `maximum` instances (prints each pair)."""
        correct, total = 0, 0
        counter = 0
        for instance in iterview(data, colored('Decoding', 'red')):
            if counter == maximum:
                break
            psi = self.features.potentials_catchup(instance, self.updater.w,
                                                   self.updater)
            ur1 = instance.ur
            results = self.model.decode(instance.sr, psi, minx=MINX,
                                        miny=MINY)
            # NOTE(review): `ll` is computed but never used — dead code.
            ll = self.model.ll(instance.sr, ur1, psi, minx=MINX, miny=MINY)
            score, ur2 = results[0], results[1]
            if ur1 == ur2:
                correct += 1
            print ur1, ur2
            total += 1
            counter += 1
        print
        return float(correct) / total

    def ll(self, tree, ur):
        """Log-likelihood of producing `ur` from the instance's surface form."""
        psi = self.features.potentials_catchup(tree, self.updater.w,
                                               self.updater)
        return self.model.ll(tree.sr, ur, psi, minx=MINX, miny=MINY)
            # NOTE(review): everything down to `return float(correct) / total`
            # plus the following `ll` method is a verbatim duplicate of the
            # tail of TransducerModel.evaluate and TransducerModel.ll above —
            # this looks like an accidental paste/merge artifact.  As written
            # these statements are not attached to any visible enclosing
            # method (`instance`, `correct`, `total`, `counter`, `self` are
            # unbound here).  Confirm against version control and remove the
            # duplicate.
            psi = self.features.potentials_catchup(instance, self.updater.w,
                                                   self.updater)
            ur1 = instance.ur
            results = self.model.decode(instance.sr, psi, minx=MINX,
                                        miny=MINY)
            ll = self.model.ll(instance.sr, ur1, psi, minx=MINX, miny=MINY)
            score, ur2 = results[0], results[1]
            if ur1 == ur2:
                correct += 1
            print ur1, ur2
            total += 1
            counter += 1
        print
        return float(correct) / total

    def ll(self, tree, ur):
        """ gets the log-likelihood """
        psi = self.features.potentials_catchup(tree, self.updater.w,
                                               self.updater)
        return self.model.ll(tree.sr, ur, psi, minx=MINX, miny=MINY)


if __name__ == "__main__":
    # Smoke test: build an alphabet from two Spanish gerund pairs and train.
    data = [("hablar", "hablando"), ("comer", "comiendo")]
    Sigma = Alphabet()
    Sigma.add("")  # epsilon must be id 0
    for (x, y) in data:
        for c in list(x):
            Sigma.add(c)
        for c in list(y):
            Sigma.add(c)
    # NOTE(review): `TransductionModel` is not defined in this file (the
    # class above is `TransducerModel`, with a different signature) — as
    # written this raises NameError; presumably refers to a class defined
    # elsewhere or predates a rename.  Also, profile.runctx's signature is
    # (statement, globals, locals): the arguments below are passed in
    # reversed order (harmless at module scope where the two dicts are the
    # same object, but misleading), and `tm.train()` assumes a `train`
    # method, not the `train` data attribute used by TransducerModel.
    tm = TransductionModel(Sigma, data)
    profile.runctx("tm.train()", locals(), globals())