Example #1
class Dataset(object):

    def __init__(self, train, dev, test):
        self.train = train
        self.dev = dev
        self.test = test
        # indexes will be populated by `_index`.
        self.Y = Alphabet()          # tag set
        self.V = Alphabet()          # vocabulary
        self.V_freq = Counter()      # token unigram counts
        self.V2Y = defaultdict(set)  # tag dictionary
        self.prefixes = Counter()
        self.suffixes = Counter()
        self._index(self.train)

    def _index(self, data):
        "frequency tables, etc."
        for sentence in data:
            for y, w in sentence:
                self.Y.add(y)
                self.V.add(w)
                self.V2Y[w].add(y)
                self.V_freq[w] += 1
                for prefix in prefixes(w):
                    self.prefixes[prefix] += 1
                for suffix in suffixes(w):
                    self.suffixes[suffix] += 1

    def make_instances(self, fold, cls):
        "Convert tuples in data `fold` to instances of `cls`."
        data = []
        for x in iterview(getattr(self, fold), msg='Features (%s)' % fold):
            tags, tokens = zip(*x)
            data.append(cls(tokens, self.Y.map(tags), self))
        return data

    def tag_ngram_counts(self, n):
        "Returns tag ngram count for subsequences of length n."

#        Y = self.Y

        def tag_sequences():
            """Iterate over tag sequence (as `str` instead of `int`, which is how they are
            stored.).

            """
            for e in self.train:
                y, _ = zip(*e)
#                assert all(isinstance(yy, int) for yy in y), y
#                yield tuple(Y.lookup_many(y))
                yield y

        return ngram_counts(tag_sequences(), n)
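
A minimal usage sketch with hypothetical data (the snippets on this page omit their imports: `Counter`/`defaultdict` come from `collections`, while `Alphabet`, `iterview`, `ngram_counts`, `prefixes`, and `suffixes` are assumed helpers from the surrounding package):

# Sentences are lists of (tag, word) pairs, as `_index` expects.
train = [[('D', 'the'), ('N', 'dog'), ('V', 'barks')],
         [('D', 'a'), ('N', 'cat')]]
ds = Dataset(train, dev=[], test=[])
print(len(ds.V))               # vocabulary size: 5
print(ds.V2Y['dog'])           # tags observed with 'dog': set(['N'])
print(ds.tag_ngram_counts(2))  # counts of adjacent tag pairs in train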
Example #2
class Dictionary(object):
    """ Reads in the dictionary with confidence scores """

    def __init__(self, fin, source_lang, target_lang):
        # variables
        self.source_lang = source_lang
        self.target_lang = target_lang
        
        # intern the variables
        self.source = Alphabet()
        self.target = Alphabet()
        
        self.store = {}
        with codecs.open(fin, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                source, target, score = line.split(" ")
                score = float(score)
                self.store[(source, target)] = score
                self.source.add(source)
                self.target.add(target)

    def vectorize(self, segmentations, maxn=200, threshold=5):
        """ vectorize to features for theano """

        lookup = {'11': 0, '22': 1}
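        # '11' and '22' are dummy keys that reserve ids 0 and 1, so real
        # segments receive ids >= 2 (cf. the `if i < 2: continue` below).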
        index = Alphabet()
        count = dd(int)
        for segmentation in segmentations:
            for segment in segmentation:
                if segment not in lookup:
                    lookup[segment] = len(lookup)
                count[segment] += 1
        # create vectors
        self.N = 2  # dims 0 and 1 are reserved for the dictionary features below
        for k, v in count.items():
            if v > threshold:
                index.add(k)
                self.N += 1
        seg2vec = {}
        for seg, i in lookup.items():
            if i < 2:
                continue
            vec = zeros((self.N))
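            # NOTE: `self.d` below is assumed to be a spell-checker
            # dictionary (e.g. pyenchant's Dict); it is never initialized in
            # __init__ above.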
            if (self.d.check(seg)
                    or self.d.check(seg.title())) and len(seg) > 3:
                vec[0] = 1.0
            elif len(seg) > 3:
                vec[1] = 1.0
            if count[seg] > threshold:
                vec[index[seg] + 2] = 1.0
            seg2vec[seg] = vec

        # segmentation2vec
        self.segmentation2vec = {}
        for segmentation in segmentations:
            f = zeros((self.N))
            for segment in segmentation:
                f += seg2vec[segment]
            self.segmentation2vec[' '.join(segmentation)] = f
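
A sketch of the expected input, assuming a whitespace-separated "source target score" file (hypothetical name and contents):

# dict.de-en.txt, one triple per line:
#   Buch book 0.9
#   Stuhl chair 0.8
d = Dictionary("dict.de-en.txt", "de", "en")
print(d.store[(u"Buch", u"book")])  # 0.9
print(len(d.source))                # number of distinct source words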
Example #4
class StringFeatures(object):
    """ String features """
    
    def __init__(self, prefix_length=0, suffix_length=4):
        self.prefix_length, self.suffix_length = prefix_length, suffix_length
        self.attributes = Alphabet()
        self.word2attributes = {}
        self.words = Alphabet()

        
    def get_attributes(self, word, extract=False):
        """ extract the features """
        
        lst = []
        for i in xrange(1, self.prefix_length+1):
            if i > len(word):
                break
            prefix = word[:i]
            name = "PREFIX: "+prefix
            if extract:
                self.attributes.add(name)
            if name in self.attributes:
                lst.append(self.attributes[name])
            
        for i in xrange(1, self.suffix_length+1):
            if i > len(word):
                break
            suffix = word[-i:]
            name = "SUFFIX: "+suffix
            if extract:
                self.attributes.add(name)
            if name in self.attributes:
                lst.append(self.attributes[name])

        return lst
                

    def store(self, word):
        """ store the features """

        self.words.add(word)
        i = self.words[word] 
        self.word2attributes[i] = self.get_attributes(word, True)

        
    def __len__(self):
        return len(self.attributes)

    
    def __getitem__(self, word):
        if word in self.words:
            i = self.words[word]
            return self.word2attributes[i]
        # don't extract
        return self.get_attributes(word, False)
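
A minimal usage sketch (Python 2, matching the `xrange` above; `Alphabet` is the same assumed helper as in the other snippets):

sf = StringFeatures()           # prefix_length=0, suffix_length=4
sf.store(u"walking")            # extracts SUFFIX: g, ng, ing, king
print(sf[u"walking"])           # attribute ids of the stored word
print(sf[u"jumped"])            # [] - unseen word, none of its suffixes known
print(len(sf))                  # 4 attributes so far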
Example #5
class Segmenter(object):
    """ Segmenter """
    def __init__(self,
                 train,
                 dev,
                 test,
                 decode_type,
                 split_num,
                 log_fname=None,
                 segmenter_type='tree',
                 G=3,
                 weights='weights',
                 alphabet=None,
                 T_L=2,
                 T_eta=1.0,
                 T_C=0.0000001,
                 S_L=2,
                 S_eta=1.0,
                 S_C=0.00000001):
        # set up the logging system
        log = logging.getLogger('')
        log.setLevel(logging.INFO)
        formatter = logging.Formatter("%(asctime)s<>%(levelname)s<>%(message)s")

        ch = logging.StreamHandler(sys.stdout)
        ch.setFormatter(formatter)
        log.addHandler(ch)

        if log_fname is not None:  # log_fname defaults to None; skip file logging then
            fh = logging.handlers.RotatingFileHandler(log_fname,
                                                      maxBytes=(1048576 * 5),
                                                      backupCount=7)
            fh.setFormatter(formatter)
            log.addHandler(fh)

        self.G = G
        self.split_num = split_num
        self.segmenter_type = segmenter_type
        self.decode_type = decode_type
        self.S_L, self.S_eta, self.S_C = S_L, S_eta, S_C
        self.T_L, self.T_eta, self.T_C = T_L, T_eta, T_C
        self.sig = "split={0},L={1},C={2},eta={3},type={4}".format(
            *(self.split_num, self.T_L, self.T_C, self.T_eta,
              self.decode_type))
        logging.info("Transducer Regularizer Type: L={0}".format(T_L))
        logging.info("Transducer Regularizer Coefficient: C={0}".format(T_C))
        logging.info("Transducer Learning Rate: eta={0}".format(T_eta))
        logging.info("Segmenter Regularizer Type: L={0}".format(S_L))
        logging.info("Segmenter Regularizer Coefficient: C={0}".format(S_C))
        logging.info("Segmenter Learning Rate: eta={0}".format(S_eta))

        self.weights = weights
        self.Sigma = Alphabet()
        self.Sigma.add("")  # add epsilon at 0
        self.Sigma.add("o")
        self.Sigma.add("n")
        self.Sigma.add("y")
        self.Sigma.add("s")
        self.Sigma.add("e")

        if alphabet is not None:
            self.Sigma = self.Sigma.load(alphabet)

        # process the data
        # TODO: modularize
        self.train = self.process(train, 100000)
        self.dev = self.process(dev, 1000)
        self.test = self.process(test, 1000)

        # dump the alphabet
        self.Sigma.save("alphabets/sigma-{0}.alphabet".format(self.split_num))
        # create model
        self.segmenter = None
        logging.info("Segmenter Type: {0}".format(self.segmenter_type))
        logging.info("Decoder Type: {0}".format(self.decode_type))

        if self.segmenter_type == TREE:
            self.segmenter = TreeSegmenter(G)
            self.features = TreeFeatures(self.G, 1000)
        elif self.segmenter_type == CHUNK:
            self.segmenter = ChunkSegmenter(G)
            self.features = ChunkFeatures(self.G, 1000)
        else:
            raise Exception('Illicit Model Type')

        # transducer
        self.transducer = TransducerModel(self.train,
                                          self.dev,
                                          self.test,
                                          self.Sigma,
                                          L=self.T_L,
                                          eta=self.T_eta,
                                          C=self.T_C)

        # extract features
        self.features.featurize(self.train, 'train')
        self.features.featurize(self.dev, 'dev')
        self.features.featurize(self.test, 'test')

        # dimension of data
        self.d = 2**22 + self.features.offset
        self.updater = LazyRegularizedAdagrad(self.d,
                                              L=2,
                                              C=self.S_C,
                                              eta=0.1,
                                              fudge=1e-4)
        self.updater.w[0] = 10
        self.updater.w[1] = 10

    def save_transducer(self, directory, i):
        """ save the transducer weights """
        np.save(directory + "/transducer-{0}-{1}.npy".format(*(self.sig, i)),
                array(self.transducer.updater.w))

    def save_segmenter(self, directory, i):
        np.save(directory + "/segmenter-{0}-{1}.npy".format(*(self.sig, i)),
                array(self.updater.w))

    def save(self, directory):
        self.save_transducer(directory, 'final')
        self.save_segmenter(directory, 'final')

    def optimize(self,
                 t=None,
                 load=False,
                 transducer=None,
                 segmenter=None,
                 iterations=20):
        """ optimize the model """
        if load:
            assert transducer is not None
            #assert segmenter is not None
            self.load_weights(transducer, segmenter)
        if t is None:
            return
        elif t == JOINT:
            self.optimize_joint(iterations)
        elif t == PIPE:
            self.optimize_pipeline(iterations)

    def load_weights(self, transducer, segmenter):
        """ load weights """
        self.transducer.updater.w = np.load(transducer)
        self.updater.w = np.load(segmenter)

    def optimize_pipeline(self, iterations=10):
        """ optimize """

        for i in xrange(iterations):
            self.transducer.optimize(1, i)
            train_acc = self.transducer.evaluate(self.train)
            dev_acc = self.transducer.evaluate(self.dev)
            test_acc = self.transducer.evaluate(self.test)
            logging.info(
                "transducer epoch {0} train acc: {1}".format(*(i, train_acc)))
            logging.info(
                "transducer epoch {0} dev acc: {1}".format(*(i, dev_acc)))
            logging.info(
                "transducer epoch {0} test acc: {1}".format(*(i, test_acc)))
            self.save_transducer(self.weights, i)

        print self.transducer.evaluate(self.dev)

        if self.segmenter_type == TREE:
            self.optimize_tree(iterations)
        elif self.segmenter_type == CHUNK:
            self.optimize_chunk(iterations)

    def optimize_chunk(self, iterations):
        """ optimize the model """
        for i in xrange(iterations):
            for tree in iterview(self.train,
                                 colored('Pass %s' % (i + 1), 'blue')):
                gt0, gt, ge = self.features.potentials_catchup(
                    tree, self.updater.w, self.updater)
                dgt0, dgt, dge = zeros_like(gt0), zeros_like(gt), zeros_like(
                    ge)
                self.segmenter.dll(tree, ge, gt0, gt, dge, dgt0, dgt)
                self.features.update(tree, dge, dgt0, dgt, self.updater)
                self.updater.step += 1

            self.save_segmenter(self.weights, i)
            train_acc, train_f1 = self.decode(self.train, 'train', VITERBI)
            dev_acc, dev_f1 = self.decode(self.dev, 'dev', VITERBI)
            test_acc, test_f1 = self.decode(self.test, 'test', VITERBI)
            logging.info("chunk epoch {0} train acc: {1}".format(i, train_acc))
            logging.info("chunk epoch {0} dev acc: {1}".format(i, dev_acc))
            logging.info("chunk epoch {0} test acc: {1}".format(i, test_acc))
            logging.info("chunk epoch {0} train f1: {1}".format(i, train_f1))
            logging.info("chunk epoch {0} dev f1: {1}".format(i, dev_f1))
            logging.info("chunk epoch {0} test f1: {1}".format(i, test_f1))

    def optimize_tree(self, iterations):
        """ optimize the model """
        for i in xrange(iterations):
            for tree in iterview(self.train,
                                 colored('Pass %s' % (i + 1), 'blue')):
                psi = self.features.potentials_catchup(tree, self.updater.w,
                                                       self.updater)
                dpsi = self.segmenter.dll(tree, psi)
                self.features.update(tree, dpsi, self.updater)
                self.updater.step += 1

            self.save_segmenter(self.weights, i)
            train_acc, train_f1 = self.decode(self.train, 'train', VITERBI)
            dev_acc, dev_f1 = self.decode(self.dev, 'dev', VITERBI)
            test_acc, test_f1 = self.decode(self.test, 'test', VITERBI)
            logging.info("tree epoch {0} train acc: {1}".format(i, train_acc))
            logging.info("tree epoch {0} dev acc: {1}".format(i, dev_acc))
            logging.info("tree epoch {0} test acc: {1}".format(i, test_acc))
            logging.info("tree epoch {0} train f1: {1}".format(i, train_f1))
            logging.info("tree epoch {0} dev f1: {1}".format(i, dev_f1))
            logging.info("tree epoch {0} test f1: {1}".format(i, test_f1))

    def optimize_joint(self, iterations, num_samples=10, eta1=0.0, eta2=0.0):
        """ optimize jointly using importance sampling """
        # TODO: unit test
        self.updater.eta = eta1
        for i in xrange(iterations):
            samples = self.transducer.sample(self.train, num=num_samples)
            for tree, sample in iterview(zip(self.train, samples),
                                         colored('Pass %s' % (i + 1), 'blue')):
                # compute approximate partition function
                logZ = NINF
                strings, weights = [], []
                for (ur, count) in sample.items():
                    score = self.transducer.ll(tree, ur)
                    if self.segmenter_type == CHUNK:
                        score += self.score_chunk(tree, ur)
                    elif self.segmenter_type == TREE:
                        score += self.score_tree(tree, ur)
                    # TODO: double check
                    logZ = logaddexp(logZ, score)
                    weights.append(score)
                    strings.append(ur)

                # normalize the importance weights by the partition estimate
                weights = [weight - logZ for weight in weights]  # TODO: double check

                # take a transducer weight gradient step with the importance sampling
                self.transducer.step_is(tree, strings, weights, eta=eta2)
                # take a segmenter weight gradient step with the importance sampling
                for ur, weight in zip(sample, weights):
                    if self.segmenter_type == CHUNK:
                        self.is_chunk(tree, ur, weight)
                    elif self.segmenter_type == TREE:
                        self.is_tree(tree, ur, weight)
                self.updater.step += 1

    def is_chunk(self, tree, ur, weight):
        """ importance sampling gradient step tree """
        tree.update_ur(ur)
        self.features.featurize_instance(tree)
        gt0, gt, ge = self.features.potentials_catchup(tree, self.updater.w,
                                                       self.updater)
        dgt0, dgt, dge = zeros_like(gt0), zeros_like(gt), zeros_like(ge)
        self.segmenter.dll(tree, ge, gt0, gt, dge, dgt0, dgt)
        dgt0 *= weight
        dgt *= weight
        dge *= weight
        self.features.update(tree, dge, dgt0, dgt, self.updater)

    def is_tree(self, tree, ur, weight):
        """ importance sampling gradient step chunk """
        tree.update_ur(ur)
        self.features.featurize_instance(tree)
        psi = self.features.potentials_catchup(tree, self.updater.w,
                                               self.updater)
        dpsi = self.segmenter.dll(tree, psi)
        dpsi *= weight
        self.features.update(tree, dpsi, self.updater)

    def baseline_ur(self, data):
        """ baseline ur """
        for tree in iterview(data, colored('Updating Baseline UR', 'red')):
            tree.ur_samples = []
            tree.ur_samples.append(tree.sr)

    def decode_ur(self, data):
        """ decodes the UR """
        for tree in iterview(data, colored('Updating Viterbi UR', 'red')):
            tree.ur_samples = []
            viterbi_ur = self.transducer.decode(tree)[1]
            tree.ur_samples.append(viterbi_ur)

    def oracle_ur(self, data):
        """ uses the oracle  UR """
        for tree in iterview(data, colored('Updating Oracle UR', 'red')):
            tree.ur_samples = []
            tree.ur_samples.append(tree.ur_gold)

    def sample_ur(self, data, num_samples=1000):
        """ samples the UR """
        samples = self.transducer.sample(data, num=num_samples)
        for tree, sample in iterview(zip(data, samples),
                                     colored('Sampling', 'red')):
            tree.ur_samples = []
            viterbi_ur = self.transducer.decode(tree)[1]
            tree.ur_samples.append(viterbi_ur)
            for ur, count in sample.items():
                tree.ur_samples.append(ur)

    def decode_chunk(self, tree, ur):
        """ decodes a chunk """
        tree.update_ur(ur)
        self.features.featurize_instance(tree)
        gt0, gt, ge = self.features.potentials_catchup(tree, self.updater.w,
                                                       self.updater)
        best, segments, labels = self.segmenter.decode(tree, ge, gt0, gt)
        truth = [tree.ur[i:j] for i, j in tree.indices]
        guess = [tree.ur[i:j] for i, j in segments]
        return truth, guess

    def decode_tree(self, tree, ur):
        """ decodes a tree """
        tree.update_ur(ur)
        self.features.featurize_instance(tree)
        psi = self.features.potentials_catchup(tree, self.updater.w,
                                               self.updater)
        max_score, tree_string, max_spans = self.segmenter.argmax(
            tree.M, self.G, psi, tree.ur)

        gold_spans = set(tree.spans)
        guess_spans = set(max_spans)
        p, r = 0.0, 0.0
        for span in gold_spans:
            if span in guess_spans:
                p += 1.0
        p /= len(gold_spans)
        for span in guess_spans:
            if span in gold_spans:
                r += 1.0
        r /= len(guess_spans)
        f1 = (2 * p * r) / (p + r) if p + r > 0 else 0.0

        # TODO: horrible hack
        segmentation = tree_string.replace("(",
                                           "").replace(")",
                                                       "").replace(" ", "")
        for i in xrange(100):
            segmentation = segmentation.replace(str(i), "")
        segmentation = segmentation.split(":")
        guess = segmentation[:-1]
        truth = [x[0] for x in to_segmentation(tree.root)]
        return truth, guess, f1

    def score_chunk(self, tree, ur):
        """ scores a chunk """
        tree.update_ur(ur)
        self.features.featurize_instance(tree)
        M = tree.M
        gt0, gt, ge = self.features.potentials_catchup(tree, self.updater.w,
                                                       self.updater)
        return self.segmenter.logZ(ge, gt0, gt, M)

    def score_tree(self, tree, ur):
        """ scores a tree """
        tree.update_ur(ur)
        self.features.featurize_instance(tree)
        M = tree.M
        psi = self.features.potentials_catchup(tree, self.updater.w,
                                               self.updater)
        return self.segmenter.logZ(M, self.G, psi)

    def decode(self, data, data_type, decode_type=None, sep=u"#"):
        """ decode the chunker """
        if decode_type is None:
            decode_type = self.decode_type

        if decode_type == ORACLE:
            self.oracle_ur(data)
        elif decode_type == BASELINE:
            self.baseline_ur(data)
        elif decode_type == VITERBI:
            self.decode_ur(data)
        elif decode_type == SAMPLE:
            self.sample_ur(data)
        else:
            raise Exception('Illicit Decode Type')

        ur_correct, ur_total = 0, 0
        correct, f1, tree_f1, lev, total = 0, 0, 0, 0, 0
        for tree in iterview(data, colored('Decoding', 'red')):
            max_ur, max_score = None, NINF
            for ur in tree.ur_samples:
                tree.update_ur(ur)
                score = self.transducer.ll(tree, ur)
                if self.segmenter_type == CHUNK:
                    score += self.score_chunk(tree, ur)
                elif self.segmenter_type == TREE:
                    score += self.score_tree(tree, ur)
                # take the best importance sample
                if score >= max_score:
                    max_score = score
                    max_ur = ur
            if max_ur == tree.ur_gold:
                ur_correct += 1
            ur_total += 1
            truth, guess, tree_f1_tmp = None, None, None
            if self.segmenter_type == CHUNK:
                truth, guess = self.decode_chunk(tree, max_ur)
            elif self.segmenter_type == TREE:
                truth, guess, tree_f1_tmp = self.decode_tree(tree, max_ur)
                tree_f1 += tree_f1_tmp

            # ACCURACY
            if truth == guess:
                correct += 1
            # LEVENSHTEIN
            lev += Levenshtein.distance(sep.join(truth), sep.join(guess))
            # F1
            set1, set2 = set(guess), set(truth)
            p, r = 0.0, 0.0  # floats: avoid Python 2 integer division
            for e in set1:
                if e in set2:
                    p += 1
            for e in set2:
                if e in set1:
                    r += 1
            p /= len(set1)
            r /= len(set2)
            if p + r > 0:
                f1 += 2 * p * r / (p + r)
            total += 1

        logging.info("decoder type: {0}".format(decode_type))
        logging.info("{0} ur acc: {1}".format(*(data_type,
                                                ur_correct / total)))
        logging.info("{0} seg acc: {1}".format(*(data_type, correct / total)))
        logging.info("{0} f1: {1}".format(*(data_type, f1 / total)))
        logging.info("{0} edit: {1}".format(*(data_type, lev / total)))
        if self.segmenter_type == TREE:
            logging.info("{0} tree f1: {1}".format(*(data_type,
                                                     tree_f1 / total)))

    def process(self, fname, maximum=100):
        """ 
        Put the string data into the data structures
        necessary for training and decoding the model. 
        """
        processed = []
        data = Data(fname)
        for counter, (sr, (tree, (indices, index_labels))) in enumerate(data):
            if counter == maximum:
                break
            ur = to_string(tree)
            for s in list(sr):
                self.Sigma.add(s)
            for s in list(ur):
                self.Sigma.add(s)

            spans, labels = [], []
            for node in walk(tree):
                spans.append((node.i, node.j))
                labels.append(node.label)
            t = Tree(self.G, sr, ur, spans, labels, indices, index_labels,
                     len(spans), tree)
            processed.append(t)

        return processed
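
The segment-level accuracy/F1 bookkeeping in `decode` is easy to get wrong under Python 2 integer division; a self-contained sketch of the same set-based P/R/F1 computation:

def segment_f1(truth, guess):
    """Set-based precision/recall/F1 over segments, as in decode() above."""
    set1, set2 = set(guess), set(truth)
    if not set1 or not set2:
        return 0.0
    hits = len(set1 & set2)
    p = hits / float(len(set1))
    r = hits / float(len(set2))
    return 2 * p * r / (p + r) if p + r > 0 else 0.0

print(segment_f1([u"un", u"happi", u"ness"], [u"un", u"happiness"]))  # 0.4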
Example #6
class PTB(object):
    "Load the POS-tagged Penn Treebank."

    def __init__(self, base, coarse=True):
        self.base = base
        self.coarse = coarse
        self.Y = Alphabet()   # tag set
        self.V, self.V_freq = Alphabet(), {} # vocabulary
        self.V2Y, self.Y2V = dd(set), dd(set)
        self.train, self.dev, self.test = [], [], []
        self.prefixes, self.suffixes = {}, {}
        self.prefix2int, self.suffix2int = {}, {}

        # Read data and create standard splits according to
        # http://aclweb.org/aclwiki/index.php?title=POS_Tagging_(State_of_the_art)
        #
        # train split [0,18]
        for sectionid in range(19):
            read = self.read_section(sectionid)
            for sentence in read:
                self.train.append(sentence)
                for y, w in sentence:
                    self.V.add(w)
                    self.V2Y[w].add(self.Y.lookup(y))
                    self.Y2V[self.Y.lookup(y)].add(w)
                    if w not in self.V_freq:
                        self.V_freq[w] = 0
                    self.V_freq[w] += 1
                    for prefix in self.extract_prefixes(w):
                        if prefix not in self.prefixes:
                            self.prefixes[prefix] = 0
                        self.prefixes[prefix] += 1
                    for suffix in self.extract_suffixes(w):
                        if suffix not in self.suffixes:
                            self.suffixes[suffix] = 0
                        self.suffixes[suffix] += 1

        # dev split [19,21]
        for sectionid in range(19, 22):
            read = self.read_section(sectionid)

            for sentence in read:
                self.dev.append(sentence)

        # test split [22,24]
        for sectionid in range(22, 25):
            self.test.extend(self.read_section(sectionid))
        self.Y.freeze()

    def extract_prefixes(self, w, n=10):
        """ gets prefixes up to length n """
        prefixes = []
        for i in range(1, min(len(w)+1, n+1)):
            segment = w[:i]
            if segment not in self.prefix2int:
                self.prefix2int[segment] = len(self.prefix2int)
            prefixes.append(segment)
        return prefixes

    def extract_suffixes(self, w, n=10):
        """ gets suffixes up to lenght n """
        suffixes = []
        for i in range(1, min(len(w)+1, n+1)):
            segment = w[-i:]
            if segment not in self.suffix2int:
                self.suffix2int[segment] = len(self.suffix2int)
            suffixes.append(segment)
        return suffixes

    def tag_bigrams(self):
        """ extract all tag bigrams """
        bigram2count = {}
        for sentence in self.train:
            for (tag1, _), (tag2, _) in zip(sentence, sentence[1:]):
                key = (tag1, tag2)
                if key not in bigram2count:
                    bigram2count[key] = 0
                bigram2count[key] += 1
        return bigram2count

    def read_section(self, sectionid):
        "Read a section number `sectionid` from the PTB."
        root = os.path.join(self.base, str(sectionid).zfill(2))
        for fname in os.listdir(root):
            if not fname.endswith('pos.gz'):
                continue
            with gzip.open(os.path.join(root, fname), 'rb') as f:
                for chunk in f.read().split('======================================'):
                    if chunk.strip():
                        if self.coarse:
                            # Note: clean up punc reduction
                            yield [(self.Y["PUNC"] if y in PUNC else self.Y[y[0]], w) for w, y in re_tagged.findall(chunk)]
                        else:
                            # TODO: what to do about bars in the tags?
                            # FIND OUT AND CLEAN UP
                            yield [(self.Y["PUNC"] if y in PUNC else self.Y[y.split("|")[0]] if "|" in y else self.Y[y], w) for w, y in re_tagged.findall(chunk)]

    def pp(self, sentence):
        "Pretty print."
        return ' '.join('%s/%s' % (w, self.Y.lookup(t)) for (t, w) in sentence)
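
A usage sketch (hypothetical path; requires the POS-tagged PTB `.pos.gz` section directories on disk, plus the `re_tagged` pattern and `PUNC` set this snippet assumes):

ptb = PTB("/path/to/treebank/tagged", coarse=True)
print(len(ptb.train))            # sentences in sections 00-18
print(ptb.pp(ptb.train[0]))      # e.g. "Pierre/N Vinken/N ,/PUNC ..."
bigrams = ptb.tag_bigrams()      # counts of adjacent tag-id pairs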
Example #7
class Lexicon(object):
    """ Reads in the universal morpholigcal lexicon """

    def __init__(self, fin, atts, vals, av, avs):
        # probably redundant...
        # but not optimizing for space so who cares
        self.atts, self.vals, self.av, self.avs = atts, vals, av, avs

        self.lexicon = dd(list)
        self.words = Alphabet()
        
        with codecs.open(fin, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line == "":
                    continue
                word, lemma, tags = line.split(" ")
                self.words.add(word)
                tags = tags.split(",")

                for tag in tags:
                    if len(tag.split("=")) != 2:
                        print line
                        print tag
                    a, v = tag.split("=")
                    self.av[a].add(v)

                    self.atts.add(a)
                    self.vals.add(v)

                self.lexicon[word].append((lemma, tags))

        # get rid of default dict wrapper
        self.lexicon = dict(self.lexicon)
        self.av = dict(self.av)
        
        for a, s in self.av.items():
            for v in s:
                self.avs.add((a, v))

    def create_vectors(self):
        self.N = len(self.avs)
        self.W = zeros((len(self.lexicon), self.N))
        
        # use Manaal's encoding (http://arxiv.org/abs/1512.05030)
        for w, lst in self.lexicon.items():
            vec = zeros((self.N))
            for l, ts in lst:
                for tag in ts:
                    a, v = tag.split("=")

                    #if a != "pos":
                    #    continue
                    
                    j = self.avs[(a, v)]
                    vec[j] = 1.0
            i = self.words[w]
            self.W[i] = vec

        
    def pp(self, word):
        """ pretty print the morphological tag of a word """
        i = self.words[word]
        lst = []
        for n in xrange(self.N):
            if self.W[i, n] > 0:
                lst.append("=".join(self.avs.lookup(n)))
        return word, ",".join(lst)

    
    def __getitem__(self, word):
        i = self.words[word]
        return self.W[i]
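
A sketch of the expected input format (hypothetical file; `dd` is `collections.defaultdict`, and `atts`, `vals`, `avs` are `Alphabet`s as used above):

# lexicon.txt, one "word lemma att1=val1,att2=val2" entry per line:
#   corriendo correr pos=V,tense=PRS
#   corre correr pos=V,person=3
lex = Lexicon("lexicon.txt", Alphabet(), Alphabet(), dd(set), Alphabet())
lex.create_vectors()
print(lex.pp(u"corriendo"))   # e.g. (u'corriendo', u'pos=V,tense=PRS')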
Example #8
class TransducerModel(object):
    """ Transducer model """

    def __init__(self, train, dev, test, Sigma, IL=6, L=2, eta=0.01, C=0.0001):
        self.train = train
        self.dev = dev
        self.test = test
        self.Sigma = Sigma
        assert self.Sigma[""] == 0
        self.IL = IL
        self.C = C
        self.L = L
        self.eta = eta

        # X and Y
        self.X, self.Y = Alphabet(), Alphabet()
        self.X.add(""); self.Y.add("")
        for s, si in self.Sigma.items():
            if si == 0:
                continue
            self.X.add(s)
        for s, si in self.Sigma.items():
            if si == 0:
                continue
            self.Y.add(s)
        self.X.freeze(); self.Y.freeze()

        # first order (possibly extend)
        self.P = Alphabet()
        self.P.add("")
        for s, si in self.Sigma.items():
            if si == 0:
                continue
            self.P.add(s)
        self.P.add("oo")
        self.P.add("nn")
        self.P.add("yy")
        self.P.add("ss")
        self.P.add("ee")
        self.P.freeze()
        
        # create Z
        self.Z = Alphabet()
        self.Z[""] = 0
        for p, pi in self.P.items():
            for o, oi in self.Y.items():
                z = p + o
                self.Z.add(z)
        self.Z.freeze()
        
        # model
        self.model = Transducer(self.Sigma, self.X, self.Y, self.P, self.Z, IL=self.IL)
        self.features = TransducerFeatures(self.X, self.Y, self.P)
        self.features.featurize(self.train, 'train')
        self.features.featurize(self.dev, 'dev')
        self.features.featurize(self.test, 'test')
        
        self.d = 2**22 + self.features.offset
        self.updater = LazyRegularizedAdagrad(self.d, L=self.L, C=self.C, eta=self.eta, fudge=1e-4)
        self.updater.w[0] = 10.0
        self.updater.w[1] = -10.0

    def optimize(self, iterations=10, start=0):
        """ optimize the model  """
        #np.random.shuffle(self.train)
        for i in xrange(iterations):
            for instance in iterview(self.train, colored('Pass %s' % (i+1+start), 'blue')):
                psi = self.features.potentials_catchup(instance, self.updater.w, self.updater)
                x, y = instance.sr, instance.ur
                dpsi = self.model.dll(x, y, psi, minx=MINX, miny=MINY)
                self.features.update(instance, dpsi, self.updater)
                self.updater.step += 1

    def step_is(self, tree, strings, weights, eta=0.0):
        """ optimize the model  """
        self.updater.eta = eta
        psi = self.features.potentials_catchup(tree, self.updater.w, self.updater)
        dpsi = self.model.dll_is(tree.sr, tree.ur, strings, weights, psi, minx=MINX, miny=MINY)
        self.features.update(tree, dpsi, self.updater)
        self.updater.step += 1
                
    def sample(self, data, num=1000):
        """ sample """
        samples = []
        inside = 0
        correct1, correct2, total = 0, 0, 0
        for instance in iterview(data, colored('Sampling', 'green')):
            psi = self.features.potentials_catchup(instance, self.updater.w, self.updater)
            sr = instance.sr
            dist = {}
            for s in self.model.sample(sr, psi, num=num):
                output = ""
                for x, y in s:
                    output += y
                if output not in dist:
                    dist[output] = 0
                dist[output] += 1

            count = dist[instance.ur_gold] if instance.ur_gold in dist else 0
            decoded = self.decode(instance)[1]

            if decoded != instance.ur_gold and count > 0:
                inside += 1
            if decoded == instance.ur_gold:
                correct1 += 1
            if instance.ur_gold in dist:
                correct2 += 1
            total += 1
            samples.append(dist)
            
        # TODO: log the inside/correct1/correct2 diagnostics
        return samples

    def decode(self, instance):
        """ Decodes an instance """
        psi = self.features.potentials_catchup(instance, self.updater.w, self.updater)
        results = self.model.decode(instance.sr, psi, minx=MINX, miny=MINY)
        return results
        
    def evaluate(self, data, maximum=100000000):
        """ decode the model """
        correct, total = 0, 0
        counter = 0
        for instance in iterview(data, colored('Decoding', 'red')):
            if counter == maximum:
                break
            psi = self.features.potentials_catchup(instance, self.updater.w, self.updater)
            ur1 = instance.ur
            results = self.model.decode(instance.sr, psi, minx=MINX, miny=MINY)
            score, ur2 = results[0], results[1]
            if ur1 == ur2:
                correct += 1
            print ur1, ur2
            total += 1
            counter += 1
        print
        return float(correct) / total

    def ll(self, tree, ur):
        """ gets the log-likelihood """
        psi = self.features.potentials_catchup(tree, self.updater.w, self.updater)
        return self.model.ll(tree.sr, ur, psi, minx=MINX, miny=MINY)
Example #9
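# Note: this fragment assumes `profile` (e.g. `import cProfile as profile`) and a
# `TransductionModel` class defined elsewhere; neither is shown in the snippet.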
if __name__ == "__main__":
    data = [("hablar", "hablando"), ("comer", "comiendo")]
    Sigma = Alphabet()
    Sigma.add("")
    for (x, y) in data:
        for c in list(x):
            Sigma.add(c)
        for c in list(y):
            Sigma.add(c)
            
    tm = TransductionModel(Sigma, data)
    profile.runctx("tm.train()", globals(), locals())