def build_domain(data):
    """
    Build the label and attribute domains for a dataset.

    For every example ``x`` in `data`, the labels in ``x.truth`` are added to
    the label alphabet and the attributes of every token in ``x.sequence`` are
    added to the attribute alphabet, so that each supported label and feature
    gets an entry in its ``Alphabet``.

    Returns the pair ``(labels, attributes)``. Before returning, ``freeze()``
    is called on the label alphabet and ``stop_growth()`` on the attribute
    alphabet.
    """
    labels = Alphabet()
    attributes = Alphabet()

    def token_attributes(example):
        # Flatten the attributes of all tokens into a single lazy stream.
        for token in example.sequence:
            for attribute in token.attributes:
                yield attribute

    for example in data:
        labels.add_many(example.truth)
        attributes.add_many(token_attributes(example))

    # Both domains are complete at this point.
    labels.freeze()
    attributes.stop_growth()
    return (labels, attributes)
Exemplo n.º 2
0
def build_domain(data):
    """
    Build the label and attribute domains for a dataset.

    For every example ``x`` in `data`, the labels in ``x.truth`` are added to
    the label alphabet and the attributes of every token in ``x.sequence`` are
    added to the attribute alphabet, so that each supported label and feature
    gets an entry in its ``Alphabet``.

    Returns the pair ``(labels, attributes)``. Before returning, ``freeze()``
    is called on the label alphabet and ``stop_growth()`` on the attribute
    alphabet.
    """
    labels = Alphabet()
    attributes = Alphabet()

    def token_attributes(example):
        # Flatten the attributes of all tokens into a single lazy stream.
        for token in example.sequence:
            for attribute in token.attributes:
                yield attribute

    for example in data:
        labels.add_many(example.truth)
        attributes.add_many(token_attributes(example))

    # Both domains are complete at this point.
    labels.freeze()
    attributes.stop_growth()
    return (labels, attributes)
Exemplo n.º 3
0
def build_domain(data):
    """
    Build the label and feature domains from the gold paths of a dataset.

    For every example ``x`` in `data`, the labels in ``x.truth`` are added to
    the label alphabet. The feature function ``x.F`` is then evaluated along
    the gold path -- ``F(0, None, truth[0])`` for the first position and
    ``F(t, truth[t-1], truth[t])`` for ``t`` in ``1 .. x.N - 1`` -- and every
    feature it produces is added to the feature alphabet.

    Returns the pair ``(labels, features)``. Before returning, ``freeze()`` is
    called on the label alphabet and ``stop_growth()`` on the feature
    alphabet.
    """
    labels = Alphabet()
    features = Alphabet()
    for example in data:
        # Register the gold labels of this example.
        labels.add_many(example.truth)

        # Register the features that fire along the gold path.
        extract, gold = example.F, example.truth
        features.add_many(extract(0, None, gold[0]))
        transitions = (
            feat
            for pos in xrange(1, example.N)
            for feat in extract(pos, gold[pos - 1], gold[pos])
        )
        features.add_many(transitions)

    # Both domains are complete at this point.
    labels.freeze()
    features.stop_growth()
    return (labels, features)
Exemplo n.º 4
0
def build_domain(data):
    """
    Build the label and feature domains from the gold paths of a dataset.

    For every example ``x`` in `data`, the labels in ``x.truth`` are added to
    the label alphabet. The feature function ``x.F`` is then evaluated along
    the gold path -- ``F(0, None, truth[0])`` for the first position and
    ``F(t, truth[t-1], truth[t])`` for ``t`` in ``1 .. x.N - 1`` -- and every
    feature it produces is added to the feature alphabet.

    Returns the pair ``(labels, features)``. Before returning, ``freeze()`` is
    called on the label alphabet and ``stop_growth()`` on the feature
    alphabet.
    """
    labels = Alphabet()
    features = Alphabet()

    def transition_features(extract, gold, length):
        # Features of every (previous label, current label) step after the
        # first position of the gold path.
        for pos in xrange(1, length):
            for feat in extract(pos, gold[pos - 1], gold[pos]):
                yield feat

    for example in data:
        labels.add_many(example.truth)
        extract = example.F
        gold = example.truth
        features.add_many(extract(0, None, gold[0]))
        features.add_many(transition_features(extract, gold, example.N))

    # Both domains are complete at this point.
    labels.freeze()
    features.stop_growth()
    return (labels, features)
Exemplo n.º 5
0
class PTB(object):
    "Loader for the POS-tagged Penn Treebank with train/dev/test splits."

    def __init__(self, base, coarse=True):
        """
        Read sections 0-24 found under `base` and collect corpus statistics.

        The splits follow
        http://aclweb.org/aclwiki/index.php?title=POS_Tagging_(State_of_the_art)
        i.e. sections 0-18 go to `train`, 19-21 to `dev` and 22-24 to `test`.
        The vocabulary, the word/tag co-occurrence sets, the word frequencies
        and the prefix/suffix counts are gathered from the training split
        only. `coarse` selects how `read_section` maps raw tags to tag keys.
        """
        self.base, self.coarse = base, coarse
        self.Y = Alphabet()    # tag set
        self.V = Alphabet()    # vocabulary
        self.V_freq = {}       # word -> number of training occurrences
        self.V2Y = dd(set)     # word -> tags seen with it
        self.Y2V = dd(set)     # tag -> words seen with it
        self.train = []
        self.dev = []
        self.test = []
        self.prefixes = {}     # prefix -> count over training tokens
        self.suffixes = {}     # suffix -> count over training tokens
        self.prefix2int = {}
        self.suffix2int = {}

        # train split [0,18]
        for section in range(19):
            for sentence in self.read_section(section):
                self.train.append(sentence)
                for tag_id, word in sentence:
                    self.V.add(word)
                    tag = self.Y.lookup(tag_id)
                    self.V2Y[word].add(tag)
                    self.Y2V[tag].add(word)
                    self.V_freq[word] = self.V_freq.get(word, 0) + 1
                    for affix in self.extract_prefixes(word):
                        self.prefixes[affix] = self.prefixes.get(affix, 0) + 1
                    for affix in self.extract_suffixes(word):
                        self.suffixes[affix] = self.suffixes.get(affix, 0) + 1

        # dev split [19,21]
        for section in range(19, 22):
            self.dev.extend(self.read_section(section))

        # test split [22,24]
        for section in range(22, 25):
            for sentence in self.read_section(section):
                self.test.append(sentence)

        # Every section has been read, so the tag set is closed.
        self.Y.freeze()

    def extract_prefixes(self, w, n=10):
        """ gets prefixes up to length n; new ones are recorded in prefix2int """
        longest = min(len(w), n)
        found = [w[:size] for size in range(1, longest + 1)]
        for piece in found:
            # A prefix seen for the first time gets the next free integer.
            self.prefix2int.setdefault(piece, len(self.prefix2int))
        return found

    def extract_suffixes(self, w, n=10):
        """ gets suffixes up to length n; new ones are recorded in suffix2int """
        longest = min(len(w), n)
        found = [w[-size:] for size in range(1, longest + 1)]
        for piece in found:
            # A suffix seen for the first time gets the next free integer.
            self.suffix2int.setdefault(piece, len(self.suffix2int))
        return found

    def tag_bigrams(self):
        """ count every pair of adjacent tags in the training sentences """
        counts = {}
        for sentence in self.train:
            for pos in range(1, len(sentence)):
                (left_tag, _), (right_tag, _) = sentence[pos - 1], sentence[pos]
                pair = (left_tag, right_tag)
                counts[pair] = counts.get(pair, 0) + 1
        return counts

    def read_section(self, sectionid):
        "Yield the tagged sentences of PTB section number `sectionid`."
        section_dir = os.path.join(self.base, str(sectionid).zfill(2))
        for fname in os.listdir(section_dir):
            if not fname.endswith('pos.gz'):
                continue
            with gzip.open(os.path.join(section_dir, fname), 'rb') as f:
                chunks = f.read().split('======================================')
                for chunk in chunks:
                    if not chunk.strip():
                        continue
                    sentence = []
                    for w, y in re_tagged.findall(chunk):
                        if y in PUNC:
                            # Note: clean up punc reduction
                            key = "PUNC"
                        elif self.coarse:
                            # Coarse tags keep only the first character.
                            key = y[0]
                        elif "|" in y:
                            # TODO: decide how tags containing bars should be
                            # handled; for now only the part before the first
                            # bar is used.
                            key = y.split("|")[0]
                        else:
                            key = y
                        sentence.append((self.Y[key], w))
                    yield sentence

    def pp(self, sentence):
        "Pretty print."
        rendered = []
        for t, w in sentence:
            rendered.append('%s/%s' % (w, self.Y.lookup(t)))
        return ' '.join(rendered)
Exemplo n.º 6
0
class TransducerModel(object):
    """ Transducer model """

    def __init__(self, train, dev, test, Sigma, IL=6, L=2, eta=0.01, C=0.0001):
        self.train = train
        self.dev = dev
        self.test = test
        self.Sigma = Sigma
        assert self.Sigma[""] == 0
        self.IL = IL
        self.C = C
        self.L = L
        self.eta = eta

        # X and Y
        self.X, self.Y = Alphabet(), Alphabet()
        self.X.add(""); self.Y.add("")
        for s, si in self.Sigma.items():
            if si == 0:
                continue
            self.X.add(s)
        for s, si in self.Sigma.items():
            if si == 0:
                continue
            self.Y.add(s)
        self.X.freeze(); self.Y.freeze()

        # first order (possibly extend)
        self.P = Alphabet()
        self.P.add("")
        for s, si in self.Sigma.items():
            if si == 0:
                continue
            self.P.add(s)
        self.P.add("oo")
        self.P.add("nn")
        self.P.add("yy")
        self.P.add("ss")
        self.P.add("ee")
        self.P.freeze()
        
        # create Z
        self.Z = Alphabet()
        self.Z[""] = 0
        for p, pi in self.P.items():
            for o, oi in self.Y.items():
                 z = p+o
                 self.Z.add(z)
        self.Z.freeze()
        
        # model
        self.model = Transducer(self.Sigma, self.X, self.Y, self.P, self.Z, IL = self.IL)
        self.features = TransducerFeatures(self.X, self.Y, self.P)
        self.features.featurize(self.train, 'train')
        self.features.featurize(self.dev, 'dev')
        self.features.featurize(self.test, 'test')
        
        self.d = 2**22 + self.features.offset
        self.updater = LazyRegularizedAdagrad(self.d, L=self.L, C=self.C, eta=self.eta, fudge=1e-4)
        self.updater.w[0] = 10.0
        self.updater.w[1] = -10.0

    def optimize(self, iterations=10, start=0):
        """ optimize the model  """
        #np.random.shuffle(self.train)
        for i in xrange(iterations):
            for instance in iterview(self.train, colored('Pass %s' % (i+1+start), 'blue')):
                psi = self.features.potentials_catchup(instance, self.updater.w, self.updater)
                dpsi = zeros_like(psi)
                x, y = instance.sr, instance.ur
                #print "LL", self.model.ll(x, y, psi, minx=MINX, miny=MINY)
                dpsi = self.model.dll(x, y, psi, minx=MINX, miny=MINY)
                self.features.update(instance, dpsi, self.updater)
                self.updater.step += 1

    def step_is(self, tree, strings, weights, eta=0.0):
        """ optimize the model  """
        self.updater.eta = eta
        psi = self.features.potentials_catchup(tree, self.updater.w, self.updater)
        dpsi = zeros_like(psi)
        dpsi = self.model.dll_is(tree.sr, tree.ur, strings, weights, psi, minx=MINX, miny=MINY)
        self.features.update(tree, dpsi, self.updater)
        self.updater.step += 1
                
    def sample(self, data, num=1000):
        """ sample """
        samples = []
        inside = 0
        correct1, correct2, total = 0, 0, 0
        for instance in iterview(data, colored('Sampling', 'green')):
            psi = self.features.potentials_catchup(instance, self.updater.w, self.updater)
            sr = instance.sr
            dist = {}
            for s in self.model.sample(sr, psi, num=num):
                output = ""
                for x, y in s:
                    output += y
                if output not in dist:
                    dist[output] = 0
                dist[output] += 1

            count = dist[instance.ur_gold] if instance.ur_gold in dist else 0
            decoded = self.decode(instance)[1]

            if decoded != instance.ur_gold and count > 0:
                inside += 1
            if decoded == instance.ur_gold:
                correct1 += 1
            if instance.ur_gold in dist:
                correct2 += 1
            total += 1
            samples.append(dist)
            
        # TODO: put into log
        #print ; print inside
        #print correct1 / total, correct2 / total
        return samples

    def decode(self, instance):
        """ Decodes an instance """
        psi = self.features.potentials_catchup(instance, self.updater.w, self.updater)
        ur1 = instance.ur
        results = self.model.decode(instance.sr, psi, minx=MINX, miny=MINY)
        return results
        
    def evaluate(self, data, maximum=100000000):
        """ decode the model """
        correct, total = 0, 0
        counter = 0
        for instance in iterview(data, colored('Decoding', 'red')):
            if counter == maximum:
                break
            psi = self.features.potentials_catchup(instance, self.updater.w, self.updater)
            ur1 = instance.ur
            results = self.model.decode(instance.sr, psi, minx=MINX, miny=MINY)
            ll = self.model.ll(instance.sr, ur1, psi, minx=MINX, miny=MINY)
            score, ur2 = results[0], results[1]
            if ur1 == ur2:
                correct += 1
            print ur1, ur2
            total += 1
            counter += 1
        print
        return float(correct) / total

    def ll(self, tree, ur):
        """ gets the log-likelihood """
        psi = self.features.potentials_catchup(tree, self.updater.w, self.updater)
        return self.model.ll(tree.sr, ur, psi, minx=MINX, miny=MINY)