class Dataset(object):

    def __init__(self, train, dev, test):
        self.train = train
        self.dev = dev
        self.test = test
        # indexes will be populated by `_index`.
        self.Y = Alphabet()              # tag set
        self.V = Alphabet()              # vocabulary
        self.V_freq = Counter()          # token unigram counts
        self.V2Y = defaultdict(set)      # tag dictionary
        self.prefixes = Counter()
        self.suffixes = Counter()
        self._index(self.train)

    def _index(self, data):
        "frequency tables, etc."
        for sentence in data:
            for y, w in sentence:
                self.Y.add(y)
                self.V.add(w)
                self.V2Y[w].add(y)
                self.V_freq[w] += 1
                for prefix in prefixes(w):
                    self.prefixes[prefix] += 1
                for suffix in suffixes(w):
                    self.suffixes[suffix] += 1

    def make_instances(self, fold, cls):
        "Convert tuples in data `fold` to instances of `cls`."
        data = []
        for x in iterview(getattr(self, fold), msg='Features (%s)' % fold):
            tags, tokens = zip(*x)
            data.append(cls(tokens, self.Y.map(tags), self))
        return data

    def tag_ngram_counts(self, n):
        "Returns tag ngram count for subsequences of length n."
        # Y = self.Y
        def tag_sequences():
            """Iterate over tag sequences (as `str` instead of `int`,
            which is how they are stored)."""
            for e in self.train:
                y, _ = zip(*e)
                # assert all(isinstance(yy, int) for yy in y), y
                # yield tuple(Y.lookup_many(y))
                yield y
        return ngram_counts(tag_sequences(), n)

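# A minimal usage sketch (toy data invented here). It assumes the same imports
# the class above relies on (Alphabet, Counter, defaultdict) and defines
# stand-in `prefixes`/`suffixes` helpers in place of whatever module-level
# functions `_index` actually calls.
def prefixes(w, n=4):
    return [w[:i] for i in range(1, min(len(w), n) + 1)]

def suffixes(w, n=4):
    return [w[-i:] for i in range(1, min(len(w), n) + 1)]

toy = [[('D', 'the'), ('N', 'dog'), ('V', 'barks')],
       [('D', 'the'), ('N', 'dogs'), ('V', 'bark')]]
d = Dataset(train=toy, dev=[], test=[])
print d.V2Y['the']      # tags observed with 'the', e.g. set(['D'])
print d.V_freq['the']   # unigram count, here 2
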
def build_domain(data):
    """
    Do feature extraction to determine the set of *supported* features,
    i.e. those active in the ground-truth configuration and active labels.
    This function assigns each feature and label an integer.
    """
    L = Alphabet()
    A = Alphabet()
    for x in data:
        L.add_many(x.truth)
        A.add_many(f for token in x.sequence for f in token.attributes)
    # domains are now ready
    L.freeze()
    A.stop_growth()
    return (L, A)

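# A minimal usage sketch (toy structures invented to match the attribute access
# above): `x.truth` is the gold label sequence and each token in `x.sequence`
# carries string-valued `attributes`.
from collections import namedtuple
Token = namedtuple('Token', 'attributes')
Example = namedtuple('Example', 'truth sequence')

x = Example(truth=['B-LOC', 'I-LOC', 'O'],
            sequence=[Token(['w=New']), Token(['w=York']), Token(['w=.'])])
L, A = build_domain([x])
print len(L), len(A)   # number of labels / supported features
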
def preprocess_bubs_format(bubs, output):
    """Convert grammar from bubs-parser into ldp-friendly csv format.

    The result is an equivalent grammar, which is much faster to load
    because it has been integerized.

    Given a gzipped grammar from bubs-parser, e.g. `eng.M2.gr.gz`, this
    function will generate four files:

     - eng.M2.gr.csv: grammar rules
     - eng.M2.lex.csv: lexical rules
     - eng.M2.lex.alphabet: mapping from terminals to integers
     - eng.M2.sym.alphabet: mapping from syms to integers

    """
    sym = Alphabet()
    lex = Alphabet()

    import gzip
    lines = gzip.open(bubs, 'rb').readlines()

    reading_lex = False
    l = []
    f = []
    for line in iterview(lines[1:]):   # drop first line
        if line.startswith('===== LEXICON'):
            reading_lex = True
            continue
        x = line.strip().split()
        if not x:
            continue
        lhs = x[0]
        rhs = tuple(b for b in x[2:-1])
        score = x[-1]
        if len(rhs) == 1:
            rhs = (rhs[0], '')
        y, z = rhs
        lhs = sym[lhs]
        y = lex[y] if reading_lex else sym[y]
        z = sym[z] if z else -1
        if reading_lex:
            l.append({'score': score, 'head': lhs, 'left': y})
        else:
            f.append({'score': score, 'head': lhs, 'left': y, 'right': z})

    # non-gzipped loads faster.
    #DataFrame(f).to_csv(gzip.open(output + '.gr.csv.gz', 'wb'))
    #DataFrame(l).to_csv(gzip.open(output + '.lex.csv.gz', 'wb'))
    DataFrame(f).to_csv(output + '.gr.csv')
    DataFrame(l).to_csv(output + '.lex.csv')

    sym.save(output + '.sym.alphabet')
    lex.save(output + '.lex.alphabet')

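# A minimal sketch of reading the four files back in (assumes pandas is
# available and that `Alphabet` round-trips through the same `save`/`load`
# pair used elsewhere in this file; the prefix is an example).
from pandas import read_csv

def load_preprocessed(prefix):
    gr = read_csv(prefix + '.gr.csv')                  # integerized grammar rules
    lx = read_csv(prefix + '.lex.csv')                 # integerized lexical rules
    sym = Alphabet().load(prefix + '.sym.alphabet')    # id <-> nonterminal symbol
    terms = Alphabet().load(prefix + '.lex.alphabet')  # id <-> terminal
    return gr, lx, sym, terms
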
class CoraCitations(Dataset):

    def __init__(self, filename):
        self.Y = Alphabet()
        data = list(fromSGML(filename, linegrouper="<NEW.*?>",
                             bioencoding=False))
        np.random.shuffle(data)
        super(CoraCitations, self).__init__(train=data[len(data) // 5:],
                                            dev=data[:len(data) // 5],
                                            test=[])
        self.train = self.make_instances('train', Instance)
        self.dev = self.make_instances('dev', Instance)

    def evaluate(self, predict, data, name, verbosity=1):
        if not data:
            return
        if verbosity:
            print()
            print('Phrase-based F1:', name)
        f1 = F1()
        for i, x in enumerate(iterview(data, msg='Eval %s' % name)):
            pred = extract_contiguous(predict(x))
            gold = extract_contiguous(self.Y.lookup_many(x.tags))
            # (i, begin, end) uniquely identifies the span
            for (label, begins, ends) in gold:
                f1.add_relevant(label, (i, begins, ends))
            for (label, begins, ends) in pred:
                f1.add_retrieved(label, (i, begins, ends))
        if verbosity:
            print()
        return f1.scores(verbose=verbosity >= 1)

def __init__(self, fin, source_lang, target_lang):
    # variables
    self.source_lang = source_lang
    self.target_lang = target_lang
    # intern the variables
    self.source = Alphabet()
    self.target = Alphabet()
    self.store = {}
    with codecs.open(fin, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            source, target, score = line.split(" ")
            #if "Buch" in source or "Stuhl" in source:
            score = float(score)
            self.store[(source, target)] = score
            self.source.add(source)
            self.target.add(target)

def integerize(data):
    """
    Integerize dataset. Returns a triple (label alphabet, feature alphabet,
    integerized dataset).
    """
    if do_label_count:
        label_count = defaultdict(int)
        for label, features in data:
            label_count[label] += 1
        label_count = label_count.items()
        label_count.sort(key=lambda x: -x[1])   # sort by count
        print 'label count'
        for k, v in label_count:
            print '%20s => %s' % (k, v)
        sys.exit(0)
    F = Alphabet()
    L = Alphabet()
    I = [(L[label], fromiter(F.map(features), dtype=int32))
         for label, features in data]
    return (L, F, I)

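# A minimal usage sketch (toy labeled examples invented here). It assumes the
# same globals the function above uses: numpy's `fromiter`/`int32` and a
# module-level `do_label_count` flag.
do_label_count = False
toy = [('sports', ['w=game', 'w=score']),
       ('politics', ['w=vote', 'w=senate'])]
L, F, I = integerize(toy)
for label_id, feat_ids in I:
    print L.lookup(label_id), list(F.lookup_many(feat_ids))
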
def preprocess_berkeley_format(input_prefix, output, coarsen=False):
    """
    Preprocessing: convert PTB grammar into simple tsv format.
    """

    def g(x):
        if coarsen:
            x = x.split('^')[0]
            x = x.split('_')[0]
        return x

    sym = Alphabet()
    lex = Alphabet()

    lexical_rules = []
    for x in file(input_prefix + '.lexicon'):
        [(x, y, s)] = re.findall(r'(\S+)\s+(\S+)\s*\[(.*?)\]', x)
        s = float(s)
        x = g(x)
        y = g(y)
        lexical_rules.append({'score': log(s), 'head': sym[x], 'left': lex[y]})

    rules = []
    for x in file(input_prefix + '.grammar'):
        x, y = x.split(' -> ')
        y = y.split()
        if len(y) == 2:
            y, s = y
            s = float(s)
            z = -1
        else:
            assert len(y) == 3
            y, z, s = y
            s = float(s)
        x = g(x)
        y = g(y)
        if x == y and z == -1:
            continue
        x = sym[x]
        y = sym[y]
        if z != -1:
            z = g(z)
            z = sym[z]
        rules.append({'score': log(s), 'head': x, 'left': y, 'right': z})

    DataFrame(rules).to_csv(output + '.gr.csv')
    DataFrame(lexical_rules).to_csv(output + '.lex.csv')
    sym.save(output + '.sym.alphabet')
    lex.save(output + '.lex.alphabet')

def vectorize(self, segmentations, maxn=200, threshold=5):
    """
    vectorize to features for theano
    """
    lookup = {'11': 0, '22': 1}
    index = Alphabet()
    count = dd(int)
    for segmentation in segmentations:
        for segment in segmentation:
            if segment not in lookup:
                lookup[segment] = len(lookup)
            count[segment] += 1

    # create vectors
    self.N = threshold
    for k, v in count.items():
        if v > threshold:
            index.add(k)
            self.N += 1

    seg2vec = {}
    for seg, i in lookup.items():
        if i < 2:
            continue
        vec = zeros((self.N))
        if (self.d.check(seg) or self.d.check(seg.title())) and len(seg) > 3:
            vec[0] = 1.0
        elif len(seg) > 3:
            vec[1] = 1.0
        if count[seg] > threshold:
            vec[index[seg] + 2] = 1.0
        seg2vec[seg] = vec

    # segmentation2vec
    self.segmentation2vec = {}
    for segmentation in segmentations:
        f = zeros((self.N))
        for segment in segmentation:
            f += seg2vec[segment]
        self.segmentation2vec[' '.join(segmentation)] = f

class StringFeatures(object):
    """
    String features
    """

    def __init__(self, prefix_length=0, suffix_length=4):
        self.prefix_length, self.suffix_length = prefix_length, suffix_length
        self.attributes = Alphabet()
        self.word2attributes = {}
        self.words = Alphabet()

    def get_attributes(self, word, extract=False):
        """ extract the features """
        lst = []
        for i in xrange(1, self.prefix_length+1):
            if i > len(word):
                break
            prefix = word[:i]
            name = "PREFIX: "+prefix
            if extract:
                self.attributes.add(name)
            if name in self.attributes:
                lst.append(self.attributes[name])
        for i in xrange(1, self.suffix_length+1):
            if i > len(word):   # same length guard as the prefix loop
                break
            suffix = word[-i:]
            name = "SUFFIX: "+suffix
            if extract:
                self.attributes.add(name)
            if name in self.attributes:
                lst.append(self.attributes[name])
        return lst

    def store(self, word):
        """ store the features """
        self.words.add(word)
        i = self.words[word]
        self.word2attributes[i] = self.get_attributes(word, True)

    def __len__(self):
        return len(self.attributes)

    def __getitem__(self, word):
        if word in self.words:
            i = self.words[word]
            return self.word2attributes[i]
        # don't extract
        return self.get_attributes(word, False)

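# A minimal usage sketch (the words are invented): attributes are interned when
# a word is stored; unseen words are featurized against the existing attribute
# alphabet without growing it.
sf = StringFeatures(prefix_length=2, suffix_length=3)
for w in ['walking', 'talking', 'walked']:
    sf.store(w)
print len(sf)        # number of distinct prefix/suffix attributes seen so far
print sf['talked']   # ids of already-known attributes for an unseen word
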
class Segmenter(object):
    """ Segmenter """

    def __init__(self, train, dev, test, decode_type, split_num,
                 log_fname=None, segmenter_type='tree', G=3, weights='weights',
                 alphabet=None, T_L=2, T_eta=1.0, T_C=0.0000001,
                 S_L=2, S_eta=1.0, S_C=0.00000001):
        # set up the logging system
        log = logging.getLogger('')
        log.setLevel(logging.INFO)
        format = logging.Formatter("%(asctime)s<>%(levelname)s<>%(message)s")
        ch = logging.StreamHandler(sys.stdout)
        ch.setFormatter(format)
        log.addHandler(ch)
        fh = logging.handlers.RotatingFileHandler(log_fname,
                                                  maxBytes=(1048576 * 5),
                                                  backupCount=7)
        fh.setFormatter(format)
        log.addHandler(fh)

        self.G = G
        self.split_num = split_num
        self.segmenter_type = segmenter_type
        self.decode_type = decode_type
        self.S_L, self.S_eta, self.S_C = S_L, S_eta, S_C
        self.T_L, self.T_eta, self.T_C = T_L, T_eta, T_C
        self.sig = "split={0},L={1},C={2},eta={3},type={4}".format(
            *(self.split_num, self.T_L, self.T_C, self.T_eta, self.decode_type))

        logging.info("Transducer Regularizer Type: L={0}".format(T_L))
        logging.info("Transducer Regularizer Coefficient: C={0}".format(T_C))
        logging.info("Transducer Learning Rate: eta={0}".format(T_eta))
        logging.info("Segmenter Regularizer Type: L={0}".format(S_L))
        logging.info("Segmenter Regularizer Coefficient: C={0}".format(S_C))
        logging.info("Segmenter Learning Rate: eta={0}".format(S_eta))

        self.weights = weights

        self.Sigma = Alphabet()
        self.Sigma.add("")    # add epsilon at 0
        self.Sigma.add("o")
        self.Sigma.add("n")
        self.Sigma.add("y")
        self.Sigma.add("s")
        self.Sigma.add("e")
        if alphabet is not None:
            self.Sigma = self.Sigma.load(alphabet)

        # process the data
        # TODO: modularize
        self.train = self.process(train, 100000)
        self.dev = self.process(dev, 1000)
        self.test = self.process(test, 1000)

        # dump the alphabet
        self.Sigma.save("alphabets/sigma-{0}.alphabet".format(self.split_num))

        # create model
        self.segmenter = None
        logging.info("Segmenter Type: {0}".format(self.segmenter_type))
        logging.info("Decoder Type: {0}".format(self.decode_type))
        if self.segmenter_type == TREE:
            self.segmenter = TreeSegmenter(G)
            self.features = TreeFeatures(self.G, 1000)
        elif self.segmenter_type == CHUNK:
            self.segmenter = ChunkSegmenter(G)
            self.features = ChunkFeatures(self.G, 1000)
        else:
            raise Exception('Illicit Model Type')

        # transducer
        self.transducer = TransducerModel(self.train, self.dev, self.test,
                                          self.Sigma, L=self.T_L,
                                          eta=self.T_eta, C=self.T_C)

        # extract features
        self.features.featurize(self.train, 'train')
        self.features.featurize(self.dev, 'dev')
        self.features.featurize(self.test, 'test')

        # dimension of data
        self.d = 2**22 + self.features.offset
        self.updater = LazyRegularizedAdagrad(self.d, L=2, C=self.S_C,
                                              eta=0.1, fudge=1e-4)
        self.updater.w[0] = 10
        self.updater.w[1] = 10

    def save_transducer(self, directory, i):
        """ save the transducer weights """
        np.save(directory + "/transducer-{0}-{1}.npy".format(*(self.sig, i)),
                array(self.transducer.updater.w))

    def save_segmenter(self, directory, i):
        """ save the segmenter weights """
        np.save(directory + "/segmenter-{0}-{1}.npy".format(*(self.sig, i)),
                array(self.updater.w))

    def save(self, directory):
        self.save_transducer(directory, 'final')
        self.save_segmenter(directory, 'final')

    def optimize(self, t=None, load=False, transducer=None, segmenter=None,
                 iterations=20):
        """ optimize the model """
        if load:
            assert transducer is not None
            #assert segmenter is not None
            self.load_weights(transducer, segmenter)
        if t is None:
            return
        elif t == JOINT:
            self.optimize_joint(iterations)
        elif t == PIPE:
            self.optimize_pipeline(iterations)

    def load_weights(self, transducer, segmenter):
        """ load weights """
        self.transducer.updater.w = np.load(transducer)
        self.updater.w = np.load(segmenter)

    def optimize_pipeline(self, iterations=10):
        """ optimize """
        for i in xrange(iterations):
            self.transducer.optimize(1, i)
            train_acc = self.transducer.evaluate(self.train)
            dev_acc = self.transducer.evaluate(self.dev)
            test_acc = self.transducer.evaluate(self.test)
            logging.info("transducer epoch {0} train acc: {1}".format(*(i, train_acc)))
            logging.info("transducer epoch {0} dev acc: {1}".format(*(i, dev_acc)))
            logging.info("transducer epoch {0} test acc: {1}".format(*(i, test_acc)))
            self.save_transducer(self.weights, i)
            print self.transducer.evaluate(self.dev)

        if self.segmenter_type == TREE:
            self.optimize_tree(iterations)
        elif self.segmenter_type == CHUNK:
            self.optimize_chunk(iterations)

    def optimize_chunk(self, iterations):
        """ optimize the model """
        for i in xrange(iterations):
            for tree in iterview(self.train,
                                 colored('Pass %s' % (i + 1), 'blue')):
                gt0, gt, ge = self.features.potentials_catchup(
                    tree, self.updater.w, self.updater)
                dgt0, dgt, dge = zeros_like(gt0), zeros_like(gt), zeros_like(ge)
                self.segmenter.dll(tree, ge, gt0, gt, dge, dgt0, dgt)
                self.features.update(tree, dge, dgt0, dgt, self.updater)
                self.updater.step += 1
            self.save_segmenter(self.weights, i)
            # `decode` logs accuracy, F1 and edit distance per fold itself
            # and returns nothing.
            self.decode(self.train, 'train', VITERBI)
            self.decode(self.dev, 'dev', VITERBI)
            self.decode(self.test, 'test', VITERBI)

    def optimize_tree(self, iterations):
        """ optimize the model """
        for i in xrange(iterations):
            for tree in iterview(self.train,
                                 colored('Pass %s' % (i + 1), 'blue')):
                psi = self.features.potentials_catchup(tree, self.updater.w,
                                                       self.updater)
                dpsi = self.segmenter.dll(tree, psi)
                self.features.update(tree, dpsi, self.updater)
                self.updater.step += 1
            self.save_segmenter(self.weights, i)
            # `decode` logs the per-fold metrics itself.
            self.decode(self.train, 'train', VITERBI)
            self.decode(self.dev, 'dev', VITERBI)
            self.decode(self.test, 'test', VITERBI)

    def optimize_joint(self, iterations, num_samples=10, eta1=0.0, eta2=0.0):
        """ optimize jointly using importance sampling """
        # TODO: unit test
        self.updater.eta = eta1
        for i in xrange(iterations):
            samples = self.transducer.sample(self.train, num=num_samples)
            for tree, sample in iterview(zip(self.train, samples),
                                         colored('Pass %s' % (i + 1), 'blue')):
                # compute approximate partition function
                logZ = NINF
                strings, weights = [], []
                for (ur, count) in sample.items():
                    score = self.transducer.ll(tree, ur)
                    if self.segmenter_type == CHUNK:
                        score += self.score_chunk(tree, ur)
                    elif self.segmenter_type == TREE:
                        score += self.score_tree(tree, ur)
                    # TODO: double check
                    logZ = logaddexp(logZ, score)
                    weights.append(score)
                    strings.append(ur)

                #TODO: make more elegant
                tmp = []
                for weight in weights:
                    tmp.append(weight - logZ)
                # TODO: double check
                weights = tmp

                # take a transducer weight gradient step with the importance samples
                self.transducer.step_is(tree, strings, weights, eta=eta2)

                # take a segmenter weight gradient step with the importance samples
                for ur, weight in zip(sample, weights):
                    if self.segmenter_type == CHUNK:
                        self.is_chunk(tree, ur, weight)
                    elif self.segmenter_type == TREE:
                        self.is_tree(tree, ur, weight)
                self.updater.step += 1

    def is_chunk(self, tree, ur, weight):
        """ importance sampling gradient step (chunk) """
        tree.update_ur(ur)
        self.features.featurize_instance(tree)
        gt0, gt, ge = self.features.potentials_catchup(tree, self.updater.w,
                                                       self.updater)
        dgt0, dgt, dge = zeros_like(gt0), zeros_like(gt), zeros_like(ge)
        self.segmenter.dll(tree, ge, gt0, gt, dge, dgt0, dgt)
        dgt0 *= weight
        dgt *= weight
        dge *= weight
        self.features.update(tree, dge, dgt0, dgt, self.updater)

    def is_tree(self, tree, ur, weight):
        """ importance sampling gradient step (tree) """
        tree.update_ur(ur)
        self.features.featurize_instance(tree)
        psi = self.features.potentials_catchup(tree, self.updater.w,
                                               self.updater)
        dpsi = self.segmenter.dll(tree, psi)
        dpsi *= weight
        self.features.update(tree, dpsi, self.updater)

    def baseline_ur(self, data):
        """ baseline ur """
        for tree in iterview(data, colored('Updating Baseline UR', 'red')):
            tree.ur_samples = []
            tree.ur_samples.append(tree.sr)

    def decode_ur(self, data):
        """ decodes the UR """
        for tree in iterview(data, colored('Updating Viterbi UR', 'red')):
            tree.ur_samples = []
            viterbi_ur = self.transducer.decode(tree)[1]
            tree.ur_samples.append(viterbi_ur)

    def oracle_ur(self, data):
        """ uses the oracle UR """
        for tree in iterview(data, colored('Updating Oracle UR', 'red')):
            tree.ur_samples = []
            tree.ur_samples.append(tree.ur_gold)

    def sample_ur(self, data, num_samples=1000):
        """ samples the UR """
        samples = self.transducer.sample(data, num=num_samples)
        for tree, samples in iterview(zip(data, samples),
                                      colored('Sampling', 'red')):
            tree.ur_samples = []
            viterbi_ur = self.transducer.decode(tree)[1]
            tree.ur_samples.append(viterbi_ur)
            for sample, count in samples.items():
                tree.ur_samples.append(sample)

    def decode_chunk(self, tree, ur):
        """ decodes a chunk """
        tree.update_ur(ur)
        self.features.featurize_instance(tree)
        gt0, gt, ge = self.features.potentials_catchup(tree, self.updater.w,
                                                       self.updater)
        best, segments, labels = self.segmenter.decode(tree, ge, gt0, gt)
        truth = [tree.ur[i:j] for i, j in tree.indices]
        guess = [tree.ur[i:j] for i, j in segments]
        return truth, guess

    def decode_tree(self, tree, ur):
        """ decodes a tree """
        tree.update_ur(ur)
        self.features.featurize_instance(tree)
        psi = self.features.potentials_catchup(tree, self.updater.w,
                                               self.updater)
        max_score, tree_string, max_spans = self.segmenter.argmax(
            tree.M, self.G, psi, tree.ur)

        gold_spans = set(tree.spans)
        guess_spans = set(max_spans)   # predicted spans from the argmax parse
        p, r = 0.0, 0.0
        for span in gold_spans:
            if span in guess_spans:
                p += 1.0
        p /= len(gold_spans)
        for span in guess_spans:
            if span in gold_spans:
                r += 1.0
        r /= len(guess_spans)
        f1 = (2 * p * r) / (p + r)

        # TODO: horrible hack
        segmentation = tree_string.replace("(", "").replace(")", "").replace(" ", "")
        for i in xrange(100):
            segmentation = segmentation.replace(str(i), "")
        segmentation = segmentation.split(":")
        guess = segmentation[:-1]
        truth = [x[0] for x in to_segmentation(tree.root)]
        return truth, guess, f1

    def score_chunk(self, tree, ur):
        """ scores a chunk """
        tree.update_ur(ur)
        self.features.featurize_instance(tree)
        M = tree.M
        gt0, gt, ge = self.features.potentials_catchup(tree, self.updater.w,
                                                       self.updater)
        return self.segmenter.logZ(ge, gt0, gt, M)

    def score_tree(self, tree, ur):
        """ scores a tree """
        tree.update_ur(ur)
        self.features.featurize_instance(tree)
        M = tree.M
        psi = self.features.potentials_catchup(tree, self.updater.w,
                                               self.updater)
        return self.segmenter.logZ(M, self.G, psi)

    def decode(self, data, data_type, decode_type=None, sep=u"#"):
        """ decode the chunker """
        if decode_type is None:
            decode_type = self.decode_type

        if decode_type == ORACLE:
            self.oracle_ur(data)
        elif decode_type == BASELINE:
            self.baseline_ur(data)
        elif decode_type == VITERBI:
            self.decode_ur(data)
        elif decode_type == SAMPLE:
            self.sample_ur(data)
        else:
            raise Exception('Illicit Decode Type')

        ur_correct, ur_total = 0, 0
        correct, f1, tree_f1, lev, total = 0, 0, 0, 0, 0
        for tree in iterview(data, colored('Decoding', 'red')):
            max_ur, max_score = None, NINF
            counter = 0
            for ur in tree.ur_samples:
                tree.update_ur(ur)
                counter += 1
                score = 0.0
                score = self.transducer.ll(tree, ur)
                #print
                #print "LL", self.transducer.ll(tree, ur)
                if self.segmenter_type == CHUNK:
                    score += self.score_chunk(tree, ur)
                    #print "SCORE", self.score_chunk(tree, ur)
                    #print ur
                    #raw_input()
                elif self.segmenter_type == TREE:
                    score += self.score_tree(tree, ur)
                # take the best importance sample
                if score >= max_score:
                    max_score = score
                    max_ur = ur
            #print "counter", counter

            if max_ur == tree.ur_gold:
                ur_correct += 1
            ur_total += 1

            truth, guess, tree_f1_tmp = None, None, None
            if self.segmenter_type == CHUNK:
                truth, guess = self.decode_chunk(tree, max_ur)
            elif self.segmenter_type == TREE:
                truth, guess, tree_f1_tmp = self.decode_tree(tree, max_ur)
                tree_f1 += tree_f1_tmp

            # ACCURACY
            if truth == guess:
                correct += 1

            # LEVENSHTEIN
            lev += Levenshtein.distance(sep.join(truth), sep.join(guess))

            # F1
            set1, set2 = set(guess), set(truth)
            p, r = 0, 0
            for e in set1:
                if e in set2:
                    p += 1
            for e in set2:
                if e in set1:
                    r += 1
            p /= len(set1)
            r /= len(set2)
            if p + r > 0:
                f1 += 2 * p * r / (p + r)

            total += 1

        logging.info("decoder type: {0}".format(decode_type))
        logging.info("{0} ur acc: {1}".format(*(data_type, ur_correct / total)))
        logging.info("{0} seg acc: {1}".format(*(data_type, correct / total)))
        logging.info("{0} f1: {1}".format(*(data_type, f1 / total)))
        logging.info("{0} edit: {1}".format(*(data_type, lev / total)))
        if self.segmenter_type == TREE:
            logging.info("{0} tree f1: {1}".format(*(data_type, tree_f1 / total)))

    def process(self, fname, maximum=100):
        """
        Put the string data into the data structures necessary for
        training and decoding the model.
        """
        processed = []
        data = Data(fname)
        for counter, (sr, (tree, (indices, index_labels))) in enumerate(data):
            if counter == maximum:
                break
            ur = to_string(tree)
            for s in list(sr):
                self.Sigma.add(s)
            for s in list(ur):
                self.Sigma.add(s)
            spans, labels = [], []
            for node in walk(tree):
                spans.append((node.i, node.j))
                labels.append(node.label)
            t = Tree(self.G, sr, ur, spans, labels, indices, index_labels,
                     len(spans), tree)
            processed.append(t)
        return processed

# Small driver: build the alphabet from the toy pairs and profile training.
if __name__ == "__main__":
    data = [("hablar", "hablando"), ("comer", "comiendo")]
    Sigma = Alphabet()
    Sigma.add("")
    for (x, y) in data:
        for c in list(x):
            Sigma.add(c)
        for c in list(y):
            Sigma.add(c)
    tm = TransductionModel(Sigma, data)
    profile.runctx("tm.train()", locals(), globals())

class TransducerModel(object):
    """ Transducer model """

    def __init__(self, train, dev, test, Sigma, IL=6, L=2, eta=0.01, C=0.0001):
        self.train = train
        self.dev = dev
        self.test = test
        self.Sigma = Sigma
        assert self.Sigma[""] == 0
        self.IL = IL
        self.C = C
        self.L = L
        self.eta = eta

        # X and Y
        self.X, self.Y = Alphabet(), Alphabet()
        self.X.add("")
        self.Y.add("")
        for s, si in self.Sigma.items():
            if si == 0:
                continue
            self.X.add(s)
        for s, si in self.Sigma.items():
            if si == 0:
                continue
            self.Y.add(s)
        self.X.freeze()
        self.Y.freeze()

        # first order (possibly extend)
        self.P = Alphabet()
        self.P.add("")
        for s, si in self.Sigma.items():
            if si == 0:
                continue
            self.P.add(s)
        self.P.add("oo")
        self.P.add("nn")
        self.P.add("yy")
        self.P.add("ss")
        self.P.add("ee")
        self.P.freeze()

        # create Z
        self.Z = Alphabet()
        self.Z[""] = 0
        for p, pi in self.P.items():
            for o, oi in self.Y.items():
                z = p + o
                self.Z.add(z)
        self.Z.freeze()

        # model
        self.model = Transducer(self.Sigma, self.X, self.Y, self.P, self.Z,
                                IL=self.IL)
        self.features = TransducerFeatures(self.X, self.Y, self.P)
        self.features.featurize(self.train, 'train')
        self.features.featurize(self.dev, 'dev')
        self.features.featurize(self.test, 'test')
        self.d = 2**22 + self.features.offset
        self.updater = LazyRegularizedAdagrad(self.d, L=self.L, C=self.C,
                                              eta=self.eta, fudge=1e-4)
        self.updater.w[0] = 10.0
        self.updater.w[1] = -10.0

    def optimize(self, iterations=10, start=0):
        """ optimize the model """
        #np.random.shuffle(self.train)
        for i in xrange(iterations):
            for instance in iterview(self.train,
                                     colored('Pass %s' % (i+1+start), 'blue')):
                psi = self.features.potentials_catchup(instance, self.updater.w,
                                                       self.updater)
                dpsi = zeros_like(psi)
                x, y = instance.sr, instance.ur
                #print "LL", self.model.ll(x, y, psi, minx=MINX, miny=MINY)
                dpsi = self.model.dll(x, y, psi, minx=MINX, miny=MINY)
                self.features.update(instance, dpsi, self.updater)
                self.updater.step += 1

    def step_is(self, tree, strings, weights, eta=0.0):
        """ optimize the model """
        self.updater.eta = eta
        psi = self.features.potentials_catchup(tree, self.updater.w,
                                               self.updater)
        dpsi = zeros_like(psi)
        dpsi = self.model.dll_is(tree.sr, tree.ur, strings, weights, psi,
                                 minx=MINX, miny=MINY)
        self.features.update(tree, dpsi, self.updater)
        self.updater.step += 1

    def sample(self, data, num=1000):
        """ sample """
        samples = []
        inside = 0
        correct1, correct2, total = 0, 0, 0
        for instance in iterview(data, colored('Sampling', 'green')):
            psi = self.features.potentials_catchup(instance, self.updater.w,
                                                   self.updater)
            sr = instance.sr
            dist = {}
            for s in self.model.sample(sr, psi, num=num):
                output = ""
                for x, y in s:
                    output += y
                if output not in dist:
                    dist[output] = 0
                dist[output] += 1
            count = dist[instance.ur_gold] if instance.ur_gold in dist else 0
            decoded = self.decode(instance)[1]
            if decoded != instance.ur_gold and count > 0:
                inside += 1
            if decoded == instance.ur_gold:
                correct1 += 1
            if instance.ur_gold in dist:
                correct2 += 1
            total += 1
            samples.append(dist)
        # TODO: put into log
        #print ; print inside
        #print correct1 / total, correct2 / total
        return samples

    def decode(self, instance):
        """ Decodes an instance """
        psi = self.features.potentials_catchup(instance, self.updater.w,
                                               self.updater)
        ur1 = instance.ur
        results = self.model.decode(instance.sr, psi, minx=MINX, miny=MINY)
        return results

    def evaluate(self, data, maximum=100000000):
        """ decode the model """
        correct, total = 0, 0
        counter = 0
        for instance in iterview(data, colored('Decoding', 'red')):
            if counter == maximum:
                break
            psi = self.features.potentials_catchup(instance, self.updater.w,
                                                   self.updater)
            ur1 = instance.ur
            results = self.model.decode(instance.sr, psi, minx=MINX, miny=MINY)
            ll = self.model.ll(instance.sr, ur1, psi, minx=MINX, miny=MINY)
            score, ur2 = results[0], results[1]
            if ur1 == ur2:
                correct += 1
            print ur1, ur2
            total += 1
            counter += 1
        print
        return float(correct) / total

    def ll(self, tree, ur):
        """ gets the log-likelihood """
        psi = self.features.potentials_catchup(tree, self.updater.w,
                                               self.updater)
        return self.model.ll(tree.sr, ur, psi, minx=MINX, miny=MINY)

import numpy as np

from arsenal.alphabet import Alphabet
from arsenal.iterview import progress
from arsenal.terminal import colors
from collections import Counter, defaultdict

from grafl.test import make_model_func
from grafl.dataset.edge_dataset import BWD_dataset

np.set_printoptions(precision=4)

L = {
    0: 'coordinate',
    1: 'hypernym',
    2: 'hyponym',
}

A = Alphabet()
A.map([x.strip().split()[1]
       for i, x in enumerate(file('res/bowman_wordnet_longer_shuffled_synset_relations.map'))
       if i > 2])

tst = BWD_dataset('test').data
trn = BWD_dataset('train').data

trn_x = trn[0]
trn_y = trn[1]
seen = set(trn_x.flatten()) | set(trn_y.flatten())

X, Y, _ = tst
X = list(A.lookup_many(X.flatten()))
Y = list(A.lookup_many(Y.flatten()))
#D = np.array([X,Y,L.flatten()]).T

model_file = 'res/experiments/BWD-projection-Softmax_best.pkl'

def build_domain(data):
    """
    Do feature extraction to determine the set of *supported* features,
    i.e. those active in the ground-truth configuration and active labels.
    This function assigns each feature and label an integer.
    """
    L = Alphabet()
    A = Alphabet()
    for x in data:
        L.add_many(x.truth)   # add labels to label domain
        # extract features of the target path
        F = x.F
        path = x.truth
        A.add_many(F(0, None, path[0]))
        A.add_many(k for t in xrange(1, x.N) for k in F(t, path[t-1], path[t]))
    # domains are now ready
    L.freeze()
    A.stop_growth()
    return (L, A)

class Lexicon(object):
    """
    Reads in the universal morphological lexicon
    """

    def __init__(self, fin, atts, vals, av, avs):
        # probably redundant...
        # but not optimizing for space so who cares
        self.atts, self.vals, self.av, self.avs = atts, vals, av, avs
        self.lexicon = dd(list)
        self.words = Alphabet()
        with codecs.open(fin, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line == "":
                    continue
                word, lemma, tags = line.split(" ")
                self.words.add(word)
                tags = tags.split(",")
                for tag in tags:
                    if len(tag.split("=")) != 2:
                        print line
                        print tag
                    a, v = tag.split("=")
                    self.av[a].add(v)
                    self.atts.add(a)
                    self.vals.add(v)
                self.lexicon[word].append((lemma, tags))
        # get rid of default dict wrapper
        self.lexicon = dict(self.lexicon)
        self.av = dict(self.av)
        for a, s in self.av.items():
            for v in s:
                self.avs.add((a, v))

    def create_vectors(self):
        self.N = len(self.avs)
        self.W = zeros((len(self.lexicon), self.N))
        # use Manaal's encoding (http://arxiv.org/abs/1512.05030)
        for w, lst in self.lexicon.items():
            vec = zeros((self.N))
            for l, ts in lst:
                for tag in ts:
                    a, v = tag.split("=")
                    #if a != "pos":
                    #    continue
                    j = self.avs[(a, v)]
                    vec[j] = 1.0
            i = self.words[w]
            self.W[i] = vec

    def pp(self, word):
        """ pretty print the morphological tag of a word """
        i = self.words[word]
        lst = []
        for n in xrange(self.N):
            if self.W[i, n] > 0:
                lst.append("=".join(self.avs.lookup(n)))
        return word, ",".join(lst)

    def __getitem__(self, word):
        i = self.words[word]
        return self.W[i]

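# A minimal usage sketch (hypothetical path and line format). It assumes the
# same imports as the class above (Alphabet, `dd` = defaultdict, numpy's zeros)
# and that lexicon lines look like: "casas casa pos=N,num=PL".
atts, vals, avs = Alphabet(), Alphabet(), Alphabet()
av = dd(set)
lex = Lexicon("data/universal_lexicon.txt", atts, vals, av, avs)
lex.create_vectors()
print lex.pp(u"casas")   # (word, "attr=val,..." for the active dimensions)
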
class PTB(object):
    "Load the POS-tagged Penn Treebank."

    def __init__(self, base, coarse=True):
        self.base = base
        self.coarse = coarse
        self.Y = Alphabet()                      # tag set
        self.V, self.V_freq = Alphabet(), {}     # vocabulary
        self.V2Y, self.Y2V = dd(set), dd(set)
        self.train, self.dev, self.test = [], [], []
        self.prefixes, self.suffixes = {}, {}
        self.prefix2int, self.suffix2int = {}, {}

        # Read data and create standard splits according to
        # http://aclweb.org/aclwiki/index.php?title=POS_Tagging_(State_of_the_art)
        #
        # train split [0,18]
        for sectionid in range(19):
            read = self.read_section(sectionid)
            for sentence in read:
                #for tag, word in sentence:
                #    if tag == self.Y["BAD"]:
                #        break
                self.train.append(sentence)
                for y, w in sentence:
                    self.V.add(w)
                    self.V2Y[w].add(self.Y.lookup(y))
                    self.Y2V[self.Y.lookup(y)].add(w)
                    if w not in self.V_freq:
                        self.V_freq[w] = 0
                    self.V_freq[w] += 1
                    for prefix in self.extract_prefixes(w):
                        if prefix not in self.prefixes:
                            self.prefixes[prefix] = 0
                        self.prefixes[prefix] += 1
                    for suffix in self.extract_suffixes(w):
                        if suffix not in self.suffixes:
                            self.suffixes[suffix] = 0
                        self.suffixes[suffix] += 1

        # dev split [19,21]
        for sectionid in range(19, 22):
            read = self.read_section(sectionid)
            for sentence in read:
                #for tag, word in sentence:
                #    if tag == self.Y["BAD"]:
                #        break
                self.dev.append(sentence)

        # test split [22,24]
        for sectionid in range(22, 25):
            #for tag, word in sentence:
            #    if tag == self.Y["BAD"]:
            #        break
            self.test.extend(self.read_section(sectionid))

        self.Y.freeze()

    def extract_prefixes(self, w, n=10):
        """ gets prefixes up to length n """
        prefixes = []
        for i in range(1, min(len(w)+1, n+1)):
            segment = w[:i]
            if segment not in self.prefix2int:
                self.prefix2int[segment] = len(self.prefix2int)
            prefixes.append(w[:i])
        return prefixes

    def extract_suffixes(self, w, n=10):
        """ gets suffixes up to length n """
        suffixes = []
        for i in range(1, min(len(w)+1, n+1)):
            segment = w[-i:]
            if segment not in self.suffix2int:
                self.suffix2int[segment] = len(self.suffix2int)
            suffixes.append(w[-i:])
        return suffixes

    def tag_bigrams(self):
        """ extract all tag bigrams """
        bigram2count = {}
        for sentence in self.train:
            for (tag1, _), (tag2, _) in zip(sentence, sentence[1:]):
                key = (tag1, tag2)
                if key not in bigram2count:
                    bigram2count[key] = 0
                bigram2count[key] += 1
        return bigram2count

    def read_section(self, sectionid):
        "Read a section number `sectionid` from the PTB."
        root = os.path.join(self.base, str(sectionid).zfill(2))
        for fname in os.listdir(root):
            if not fname.endswith('pos.gz'):
                continue
            with gzip.open(os.path.join(root, fname), 'rb') as f:
                for chunk in f.read().split('======================================'):
                    if chunk.strip():
                        if self.coarse:
                            # STUPID BIO ENCODING
                            #yield [(self.Y["NNP"] if "NNP" in y else self.Y["OTHER"], w)
                            #       for w, y in re_tagged.findall(chunk)]
                            # Note: clean up punc reduction
                            yield [(self.Y["PUNC"] if y in PUNC else self.Y[y[0]], w)
                                   for w, y in re_tagged.findall(chunk)]
                        else:
                            # TODO: what to do about bars in the tags?
                            # FIND OUT AND CLEAN UP
                            yield [(self.Y["PUNC"] if y in PUNC
                                    else self.Y[y.split("|")[0]] if "|" in y
                                    else self.Y[y], w)
                                   for w, y in re_tagged.findall(chunk)]

    def pp(self, sentence):
        "Pretty print."
        return ' '.join('%s/%s' % (w, self.Y.lookup(t)) for (t, w) in sentence)

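# A minimal usage sketch: the path below is a placeholder for a local copy of
# the tagged WSJ sections (e.g. .../treebank_3/tagged/pos/wsj), and `re_tagged`
# and `PUNC` are assumed to be defined at module level as `read_section` expects.
ptb = PTB('/path/to/treebank/tagged/pos/wsj', coarse=True)
print len(ptb.train), len(ptb.dev), len(ptb.test)   # sentences per split
print ptb.pp(ptb.train[0])                          # "word/TAG ..." rendering
bigrams = ptb.tag_bigrams()                         # (tag1, tag2) -> count
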
def main():
    datafile = sys.argv[1]

    train, test = traintest(datafile)

    print 'train: %s, test: %s' % (len(train), len(test))

    from scipy.sparse import dok_matrix
    from sklearn import linear_model
    from sklearn.svm import SVC
    from arsenal.alphabet import Alphabet

    N_FEATURES = 100000

    alphabet = Alphabet(random_int=N_FEATURES)

    def _f1(name, data, c, verbose=True):
        if verbose:
            print
            print name
        f = F1()
        for (i, x) in enumerate(data):
            phi = dok_matrix((1, N_FEATURES))
            for k in x.features:
                phi[0, alphabet[k] % N_FEATURES] = 1.0
            [y] = c.predict(phi)
            f.report(i, y, x.label)
        f.scores(verbose=verbose)
        return f

    X = dok_matrix((len(train), N_FEATURES))

    M = len(train)
    Y = []
    X = dok_matrix((M, N_FEATURES))
    for i, x in enumerate(train):
        # binary features
        for k in x.features:
            X[i, alphabet[k] % N_FEATURES] = 1.0
        Y.append(x.label)
    X = X.tocsc()

    c = SVC(class_weight={'author': 1000,
                          'title': 1000,
                          'other': 1.0},
            verbose=1)
    c.fit(X, Y)
    _f1('train', train, c)
    ff = _f1('test', test, c, verbose=1)

    if 0:
        import numpy as np
        import matplotlib.pyplot as pl
        from mpl_toolkits.mplot3d import Axes3D
        ax = pl.figure().add_subplot(111, projection='3d')
        pl.ion()
        data = []
        for (author_weight, title_weight) in iterview(np.random.uniform(1, 10, size=(100, 2))):
            print
            print 'params:', (author_weight, title_weight)
            c = SVC(class_weight={'author': author_weight,
                                  'title': title_weight,
                                  'other': 1.0},
                    verbose=1)
            #c = linear_model.SGDClassifier()
            c.fit(X, Y)
            #_f1('train', train, c)
            ff = _f1('test', test, c, verbose=1)
            score = sum(x for (_, _, _, _, x) in ff.scores(verbose=0))
            data.append((author_weight, title_weight, score))
            print 'score:', score

            x, y, z = zip(*data)
            ax.clear()
            ax.scatter(x, y, z)
            ax.figure.canvas.draw()

        print 'done'
        pl.ioff()
        pl.show()

def main():
    from arsenal.alphabet import Alphabet
    from arsenal.maths import spherical

    D = 100
    alphabet = Alphabet(random_int=D)
    weights = spherical(D)
    direction = spherical(D)

    #sentence = 'Papa ate the caviar with the spoon in the park .'.split()
    sentence = 'Papa ate the caviar with the spoon .'.split()

    if 0:
        grammar = """
        S   S .
        S   NP VP
        NP  D N
        NP  NP PP
        VP  V NP
        VP  VP PP
        PP  P NP
        NP  Papa
        N   caviar
        N   spoon
        N   park
        V   ate
        P   with
        P   in
        D   the
        """
    else:
        grammar = """
        S   X .
        X   X X
        X   Papa
        X   ate
        X   the
        X   caviar
        X   with
        X   spoon
        X   in
        X   park
        """

    rhs = load_grammar(grammar)

    if 1:
        # This code branch enumerates all (exponentially many) valid
        # derivations.
        root = semiring_enumeration(sentence, rhs)
        for d in root.x:
            print(post_process(d))
        assert len(root.x) == len(set(root.x))

    def binary_features(sentence, X, Y, Z, i, j, k):
        return alphabet.map(['%s -> %s %s [%s,%s,%s]' % (X, Y, Z, i, j, k)])

    def unary_features(sentence, X, Y, i, k):
        return alphabet.map(['%s -> %s [%s,%s]' % (X, Y, i, k)])

    root = semiring_mert(sentence, rhs, weights, direction,
                         binary_features, unary_features)

    mert_derivations = []   #set()
    for x in root.points:
        print(x)
        d = x.derivation()
        mert_derivations.append(d)
    assert len(mert_derivations) == len(set(mert_derivations))

    #root.draw()

    # Compare the set of derivations found by the MERT semiring to 'brute force'
    # linesearch. Note: Linesearch might only find a subset of derivations found
    # by MERT if the grid isn't fine enough.
    brute_derivations = set()
    for step in np.linspace(-20, 20, 1000):
        root = semiring_linesearch(sentence, rhs, weights, step, direction,
                                   binary_features, unary_features)
        d = root.derivation()
        brute_derivations.add(d)

    # NOTE: need to take upper hull of mert (so it's currently an over estimate)
    print('mert:', len(mert_derivations))
    print('brute:', len(brute_derivations))
    assert brute_derivations.issubset(mert_derivations)