Example #1
    def __init__(self, base, coarse=True):
        self.base = base
        self.coarse = coarse
        self.Y = Alphabet()  # tag set
        self.V, self.V_freq = Alphabet(), {}  # vocabulary
        self.V2Y, self.Y2V = dd(set), dd(set)
        self.train, self.dev, self.test = [], [], []
        self.prefixes, self.suffixes = {}, {}
        self.prefix2int, self.suffix2int = {}, {}

        # Read data and create standard splits according to
        # http://aclweb.org/aclwiki/index.php?title=POS_Tagging_(State_of_the_art)
        #
        # train split [0,18]
        for sectionid in xrange(19):
            read = self.read_section(sectionid)
            for sentence in read:
                #for tag, word in sentence:
                #    if tag == self.Y["BAD"]:
                #        break

                self.train.append(sentence)
                for y, w in sentence:
                    self.V.add(w)
                    self.V2Y[w].add(self.Y.lookup(y))
                    self.Y2V[self.Y.lookup(y)].add(w)
                    if w not in self.V_freq:
                        self.V_freq[w] = 0
                    self.V_freq[w] += 1
                    for prefix in self.extract_prefixes(w):
                        if prefix not in self.prefixes:
                            self.prefixes[prefix] = 0
                        self.prefixes[prefix] += 1
                    for suffix in self.extract_suffixes(w):
                        if suffix not in self.suffixes:
                            self.suffixes[suffix] = 0
                        self.suffixes[suffix] += 1

        # dev split [19,21]
        for sectionid in xrange(19, 22):
            read = self.read_section(sectionid)

            for sentence in read:
                #for tag, word in sentence:
                #    if tag == self.Y["BAD"]:
                #        break

                self.dev.append(sentence)

        # test split [22,24]
        for sectionid in xrange(22, 25):
            #for tag, word in sentence:
            #    if tag == self.Y["BAD"]:
            #        break

            self.test.extend(self.read_section(sectionid))
        self.Y.freeze()
Example #2
def integerize(data):
    """
    Integerize dataset
    returns a triple (label alphabet, feature alphabet, integerized dataset)
    """
    F = Alphabet()
    L = Alphabet()
    I = [(L[label], fromiter(F.map(features), dtype=int32)) for label, features in data]
    return (L, F, I)
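A minimal usage sketch for the integerize helper above, on made-up toy data (it assumes the numpy imports the snippet itself relies on, and that Alphabet.lookup / Alphabet.lookup_many invert the assigned integer ids, as they do in the other examples here):

from numpy import fromiter, int32
from arsenal.alphabet import Alphabet

# Toy dataset of (label, feature-list) pairs.
data = [("pos", ["w=good", "suffix=od"]),
        ("neg", ["w=bad", "suffix=ad"])]

L, F, I = integerize(data)
label_id, feature_ids = I[0]
print(L.lookup(label_id))                # original label of the first instance
print(list(F.lookup_many(feature_ids)))  # original feature strings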
Example #3
 def __init__(self, filename):
     self.Y = Alphabet()
     data = list(
         fromSGML(filename, linegrouper="<NEW.*?>", bioencoding=False))
     np.random.shuffle(data)
     super(CoraCitations, self).__init__(train=data[len(data) // 5:],
                                         dev=data[:len(data) // 5],
                                         test=[])
     self.train = self.make_instances('train', Instance)
     self.dev = self.make_instances('dev', Instance)
Example #4
def integerize(data):
    """
    Integerize dataset
    returns a triple (label alphabet, feature alphabet, integerized dataset)
    """
    F = Alphabet()
    L = Alphabet()
    I = [(L[label], fromiter(F.map(features), dtype=int32))
         for label, features in data]
    return (L, F, I)
Example #5
class Dataset(object):

    def __init__(self, train, dev, test):
        self.train = train
        self.dev = dev
        self.test = test
        # indexes will be populated by `_index`.
        self.Y = Alphabet()          # tag set
        self.V = Alphabet()          # vocabulary
        self.V_freq = Counter()      # token unigram counts
        self.V2Y = defaultdict(set)  # tag dictionary
        self.prefixes = Counter()
        self.suffixes = Counter()
        self._index(self.train)

    def _index(self, data):
        "frequency tables, etc."
        for sentence in data:
            for y, w in sentence:
                self.Y.add(y)
                self.V.add(w)
                self.V2Y[w].add(y)
                self.V_freq[w] += 1
                for prefix in prefixes(w):
                    self.prefixes[prefix] += 1
                for suffix in suffixes(w):
                    self.suffixes[suffix] += 1

    def make_instances(self, fold, cls):
        "Convert tuples in data `fold` to instances of `cls`."
        data = []
        for x in iterview(getattr(self, fold), msg='Features (%s)' % fold):
            tags, tokens = zip(*x)
            data.append(cls(tokens, self.Y.map(tags), self))
        return data

    def tag_ngram_counts(self, n):
        "Returns tag ngram count for subsequences of length n."

#        Y = self.Y

        def tag_sequences():
            """Iterate over tag sequence (as `str` instead of `int`, which is how they are
            stored.).

            """
            for e in self.train:
                y, _ = zip(*e)
#                assert all(isinstance(yy, int) for yy in y), y
#                yield tuple(Y.lookup_many(y))
                yield y

        return ngram_counts(tag_sequences(), n)
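A hedged sketch of exercising the Dataset class above on toy tagged sentences. prefixes and suffixes are module-level helpers the snippet does not show, so hypothetical stand-ins are defined below (iterview and ngram_counts are also external, but the sketch avoids the methods that need them); this only works if everything is pasted into one module so the class can resolve the helpers:

from collections import Counter, defaultdict
from arsenal.alphabet import Alphabet

def prefixes(w, n=4):
    # hypothetical stand-in for the helper used by Dataset._index
    return [w[:i] for i in range(1, min(len(w), n) + 1)]

def suffixes(w, n=4):
    # hypothetical stand-in for the helper used by Dataset._index
    return [w[-i:] for i in range(1, min(len(w), n) + 1)]

train = [[('D', 'the'), ('N', 'dog')],
         [('D', 'the'), ('N', 'cat')]]
d = Dataset(train=train, dev=[], test=[])
print(d.V_freq['the'])       # 2
print(sorted(d.V2Y['dog']))  # ['N']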
Example #6
 def __init__(self, train, dev, test):
     self.train = train
     self.dev = dev
     self.test = test
     # indexes will be populated by `_index`.
     self.Y = Alphabet()  # tag set
     self.V = Alphabet()  # vocabulary
     self.V_freq = Counter()  # token unigram counts
     self.V2Y = defaultdict(set)  # tag dictionary
     self.prefixes = Counter()
     self.suffixes = Counter()
     self._index(self.train)
Example #7
def build_domain(data):
    """
    Do feature extraction to determine the set of *supported* features, i.e.
    those active in the ground-truth configuration and the active labels. This
    function maps each feature and each label to an integer.
    """
    L = Alphabet()
    A = Alphabet()
    for x in data:
        L.add_many(x.truth)
        A.add_many(f for token in x.sequence for f in token.attributes)
    # domains are now ready
    L.freeze()
    A.stop_growth()
    return (L, A)
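A hedged sketch of calling build_domain on toy data. Instance and Token here are hypothetical stand-ins for whatever objects carry x.truth, x.sequence and token.attributes in the real pipeline:

from collections import namedtuple
from arsenal.alphabet import Alphabet

Token = namedtuple('Token', 'attributes')
Instance = namedtuple('Instance', 'truth sequence')

data = [Instance(truth=['B-LOC', 'I-LOC'],
                 sequence=[Token(attributes=['w=New']),
                           Token(attributes=['w=York'])])]
L, A = build_domain(data)
print(len(L), len(A))  # 2 labels, 2 supported features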
Example #8
def preprocess_bubs_format(bubs, output):
    """Convert grammar from bubs-parser into ldp-friendly csv format. The result is
    an equivalent grammar, which is much faster to load because it has been
    integerized.

    Given a gzipped grammar from bubs-parser, e.g. `eng.M2.gr.gz`, this function
    will generate four files:

    - eng.M2.gr.csv: grammar rules
    - eng.M2.lex.csv: lexical rules
    - eng.M2.lex.alphabet: mapping from terminals to integers
    - eng.M2.sym.alphabet: mapping from syms to integers

    """

    sym = Alphabet()
    lex = Alphabet()

    import gzip
    lines = gzip.open(bubs, 'rb').readlines()
    reading_lex = False

    l = []
    f = []
    for line in iterview(lines[1:]):  # drop first line

        if line.startswith('===== LEXICON'):
            reading_lex = True
            continue

        x = line.strip().split()
        if not x:
            continue

        lhs = x[0]
        rhs = tuple(b for b in x[2:-1])
        score = x[-1]
        if len(rhs) == 1:
            rhs = (rhs[0], '')
        y, z = rhs
        lhs = sym[lhs]

        y = lex[y] if reading_lex else sym[y]
        z = sym[z] if z else -1

        if reading_lex:
            l.append({'score': score, 'head': lhs, 'left': y})
        else:
            f.append({'score': score, 'head': lhs, 'left': y, 'right': z})

    # non-gzipped loads faster.
    #DataFrame(f).to_csv(gzip.open(output + '.gr.csv.gz', 'wb'))
    #DataFrame(l).to_csv(gzip.open(output + '.lex.csv.gz', 'wb'))

    DataFrame(f).to_csv(output + '.gr.csv')
    DataFrame(l).to_csv(output + '.lex.csv')
    sym.save(output + '.sym.alphabet')
    lex.save(output + '.lex.alphabet')
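A hedged sketch of reading the four output files back in (it assumes pandas' read_csv for the rule tables, and that Alphabet.load is the inverse of Alphabet.save, the same pairing used in Example #18 below):

from pandas import read_csv
from arsenal.alphabet import Alphabet

grammar_rules = read_csv('eng.M2.gr.csv')   # head / left / right / score columns
lexical_rules = read_csv('eng.M2.lex.csv')  # head / left / score columns
sym = Alphabet().load('eng.M2.sym.alphabet')
lex = Alphabet().load('eng.M2.lex.alphabet')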
Example #9
class CoraCitations(Dataset):
    def __init__(self, filename):
        self.Y = Alphabet()
        data = list(
            fromSGML(filename, linegrouper="<NEW.*?>", bioencoding=False))
        np.random.shuffle(data)
        super(CoraCitations, self).__init__(train=data[len(data) // 5:],
                                            dev=data[:len(data) // 5],
                                            test=[])
        self.train = self.make_instances('train', Instance)
        self.dev = self.make_instances('dev', Instance)

    def evaluate(self, predict, data, name, verbosity=1):
        if not data:
            return
        if verbosity:
            print()
            print('Phrase-based F1:', name)
        f1 = F1()
        for i, x in enumerate(iterview(data, msg='Eval %s' % name)):
            pred = extract_contiguous(predict(x))
            gold = extract_contiguous(self.Y.lookup_many(x.tags))
            # (i,begin,end) uniquely identifies the span
            for (label, begins, ends) in gold:
                f1.add_relevant(label, (i, begins, ends))
            for (label, begins, ends) in pred:
                f1.add_retrieved(label, (i, begins, ends))
        if verbosity:
            print()
        return f1.scores(verbose=verbosity >= 1)
Example #10
    def __init__(self, train, dev, test, Sigma, IL=6, L=2, eta=0.01, C=0.0001):
        self.train = train
        self.dev = dev
        self.test = test
        self.Sigma = Sigma
        assert self.Sigma[""] == 0
        self.IL = IL
        self.C = C
        self.L = L
        self.eta = eta

        # X and Y
        self.X, self.Y = Alphabet(), Alphabet()
        self.X.add(""); self.Y.add("")
        for s, si in self.Sigma.items():
            if si == 0:
                continue
            self.X.add(s)
        for s, si in self.Sigma.items():
            if si == 0:
                continue
            self.Y.add(s)
        self.X.freeze(); self.Y.freeze()

        # first order (possibly extend)
        self.P = Alphabet()
        self.P.add("")
        for s, si in self.Sigma.items():
            if si == 0:
                continue
            self.P.add(s)
        self.P.add("oo")
        self.P.add("nn")
        self.P.add("yy")
        self.P.add("ss")
        self.P.add("ee")
        self.P.freeze()
        
        # create Z
        self.Z = Alphabet()
        self.Z[""] = 0
        for p, pi in self.P.items():
            for o, oi in self.Y.items():
                 z = p+o
                 self.Z.add(z)
        self.Z.freeze()
        
        # model
        self.model = Transducer(self.Sigma, self.X, self.Y, self.P, self.Z, IL = self.IL)
        self.features = TransducerFeatures(self.X, self.Y, self.P)
        self.features.featurize(self.train, 'train')
        self.features.featurize(self.dev, 'dev')
        self.features.featurize(self.test, 'test')
        
        self.d = 2**22 + self.features.offset
        self.updater = LazyRegularizedAdagrad(self.d, L=self.L, C=self.C, eta=self.eta, fudge=1e-4)
        self.updater.w[0] = 10.0
        self.updater.w[1] = -10.0
Example #11
 def __init__(self, fin, source_lang, target_lang):
     # variables
     self.source_lang = source_lang
     self.target_lang = target_lang
     
     # intern the variables
     self.source = Alphabet()
     self.target = Alphabet()
     
     self.store = {}
     with codecs.open(fin, encoding="utf-8") as f:
         for line in f:
             line = line.strip()
             source, target, score = line.split(" ")
             #if "Buch" in source or "Stuhl" in source:
             score = float(score)
             self.store[(source, target)] = score
             self.source.add(source)
             self.target.add(target)
Example #12
def integerize(data):
    """
    Integerize dataset
    returns a triple (label alphabet, feature alphabet, integerized dataset)
    """

    if do_label_count:
        label_count = defaultdict(int)
        for label, features in data:
            label_count[label] += 1
        label_count = label_count.items()
        label_count.sort(key=lambda x: -x[1])  # sort by count
        print 'label count'
        for k,v in label_count:
            print '%20s => %s' % (k, v)
        sys.exit(0)

    F = Alphabet()
    L = Alphabet()
    I = [(L[label], fromiter(F.map(features), dtype=int32)) for label, features in data]
    return (L, F, I)
Example #13
def integerize(data):
    """
    Integerize dataset
    returns a triple (label alphabet, feature alphabet, integerized dataset)
    """

    if do_label_count:
        label_count = defaultdict(int)
        for label, features in data:
            label_count[label] += 1
        label_count = label_count.items()
        label_count.sort(key=lambda x: -x[1])  # sort by count
        print 'label count'
        for k, v in label_count:
            print '%20s => %s' % (k, v)
        sys.exit(0)

    F = Alphabet()
    L = Alphabet()
    I = [(L[label], fromiter(F.map(features), dtype=int32))
         for label, features in data]
    return (L, F, I)
Example #14
def preprocess_berkeley_format(input_prefix, output, coarsen=False):
    """
    Preprocessing: convert PTB grammar into simple csv format.
    """
    def g(x):
        if coarsen:
            x = x.split('^')[0]
            x = x.split('_')[0]
        return x

    sym = Alphabet()
    lex = Alphabet()

    lexical_rules = []
    for x in file(input_prefix + '.lexicon'):
        [(x, y, s)] = re.findall(r'(\S+)\s+(\S+)\s*\[(.*?)\]', x)
        s = float(s)
        x = g(x)
        y = g(y)
        lexical_rules.append({'score': log(s), 'head': sym[x], 'left': lex[y]})

    rules = []
    for x in file(input_prefix + '.grammar'):
        x, y = x.split(' -> ')
        y = y.split()
        if len(y) == 2:
            y, s = y
            s = float(s)
            z = -1
        else:
            assert len(y) == 3
            y, z, s = y
            s = float(s)
        x = g(x)
        y = g(y)
        if x == y and z == -1:
            continue
        x = sym[x]
        y = sym[y]
        if z != -1:
            z = g(z)
            z = sym[z]
        rules.append({'score': log(s), 'head': x, 'left': y, 'right': z})

    DataFrame(rules).to_csv(output + '.gr.csv')
    DataFrame(lexical_rules).to_csv(output + '.lex.csv')
    sym.save(output + '.sym.alphabet')
    lex.save(output + '.lex.alphabet')
Example #15
    def vectorize(self, segmentations, maxn=200, threshold=5):
        """ vectorize to features for theano """

        lookup = {'11': 0, '22': 1}
        index = Alphabet()
        count = dd(int)
        for segmentation in segmentations:
            for segment in segmentation:
                if segment not in lookup:
                    lookup[segment] = len(lookup)
                count[segment] += 1
        # create vectors
        self.N = threshold
        for k, v in count.items():
            if v > threshold:
                index.add(k)
                self.N += 1
        seg2vec = {}
        for seg, i in lookup.items():
            if i < 2:
                continue
            vec = zeros((self.N))
            if (self.d.check(seg)
                    or self.d.check(seg.title())) and len(seg) > 3:
                vec[0] = 1.0
            elif len(seg) > 3:
                vec[1] = 1.0
            if count[seg] > threshold:
                vec[index[seg] + 2] = 1.0
            seg2vec[seg] = vec

        # segmentation2vec
        self.segmentation2vec = {}
        for segmentation in segmentations:
            f = zeros((self.N))
            for segment in segmentation:
                f += seg2vec[segment]
            self.segmentation2vec[' '.join(segmentation)] = f
Example #16
class StringFeatures(object):
    """ String features """
    
    def __init__(self, prefix_length=0, suffix_length=4):
        self.prefix_length, self.suffix_length = prefix_length, suffix_length
        self.attributes = Alphabet()
        self.word2attributes = {}
        self.words = Alphabet()

        
    def get_attributes(self, word, extract=False):
        """ extract the features """
        
        lst = []
        for i in xrange(1, self.prefix_length+1):
            if i > len(word):
                break
            prefix = word[:i]
            name = "PREFIX: "+prefix
            if extract:
                self.attributes.add(name)
            if name in self.attributes:
                lst.append(self.attributes[name])
            
        for i in xrange(1, self.suffix_length+1):
            if i > len(word):  # mirror the prefix loop: stop once i exceeds the word length
                break
            suffix = word[-i:]
            name = "SUFFIX: "+suffix
            if extract:
                self.attributes.add(name)
            if name in self.attributes:
                lst.append(self.attributes[name])

        return lst
                

    def store(self, word):
        """ store the features """

        self.words.add(word)
        i = self.words[word] 
        self.word2attributes[i] = self.get_attributes(word, True)

        
    def __len__(self):
        return len(self.attributes)

    
    def __getitem__(self, word):
        if word in self.words:
            i = self.words[word]
            return self.word2attributes[i]
        # don't extract
        return self.get_attributes(word, False)
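A short usage sketch for the StringFeatures class above (the word and parameter values are made up; attribute ids are whatever the internal Alphabet assigns):

sf = StringFeatures(prefix_length=2, suffix_length=3)
sf.store("walking")     # registers the word and caches its attribute ids
print(len(sf))          # number of distinct prefix/suffix attributes so far
print(sf["walking"])    # cached ids for a stored word
print(sf["talking"])    # unseen word: attributes looked up but not added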
Example #17
    def __init__(self, fin, atts, vals, av, avs):
        # probably redundant...
        # but not optimizing for space so who cares
        self.atts, self.vals, self.av, self.avs = atts, vals, av, avs

        self.lexicon = dd(list)
        self.words = Alphabet()
        
        with codecs.open(fin, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line == "":
                    continue
                word, lemma, tags = line.split(" ")
                self.words.add(word)
                tags = tags.split(",")

                for tag in tags:
                    if len(tag.split("=")) != 2:
                        print line
                        print tag
                    a, v = tag.split("=")
                    self.av[a].add(v)

                    self.atts.add(a)
                    self.vals.add(v)

                self.lexicon[word].append((lemma, tags))

        # get rid of default dict wrapper
        self.lexicon = dict(self.lexicon)
        self.av = dict(self.av)
        
        for a, s in self.av.items():
            for v in s:
                self.avs.add((a, v))
Example #18
class Segmenter(object):
    """ Segmenter """
    def __init__(self,
                 train,
                 dev,
                 test,
                 decode_type,
                 split_num,
                 log_fname=None,
                 segmenter_type='tree',
                 G=3,
                 weights='weights',
                 alphabet=None,
                 T_L=2,
                 T_eta=1.0,
                 T_C=0.0000001,
                 S_L=2,
                 S_eta=1.0,
                 S_C=0.00000001):
        # set up the logging system
        log = logging.getLogger('')
        log.setLevel(logging.INFO)
        format = logging.Formatter("%(asctime)s<>%(levelname)s<>%(message)s")

        ch = logging.StreamHandler(sys.stdout)
        ch.setFormatter(format)
        log.addHandler(ch)

        fh = logging.handlers.RotatingFileHandler(log_fname,
                                                  maxBytes=(1048576 * 5),
                                                  backupCount=7)
        fh.setFormatter(format)
        log.addHandler(fh)

        self.G = G
        self.split_num = split_num
        self.segmenter_type = segmenter_type
        self.decode_type = decode_type
        self.S_L, self.S_eta, self.S_C = S_L, S_eta, S_C
        self.T_L, self.T_eta, self.T_C = T_L, T_eta, T_C
        self.sig = "split={0},L={1},C={2},eta={3},type={4}".format(
            *(self.split_num, self.T_L, self.T_C, self.T_eta,
              self.decode_type))
        logging.info("Transducer Regularizer Type: L={0}".format(T_L))
        logging.info("Transducer Regularizer Coefficient: C={0}".format(T_C))
        logging.info("Transducer Learning Rate: eta={0}".format(T_eta))
        logging.info("Segmenter Regularizer Type: L={0}".format(S_L))
        logging.info("Segmenter Regularizer Coefficient: C={0}".format(S_C))
        logging.info("Segmenter Learning Rate: eta={0}".format(S_eta))

        self.weights = weights
        self.Sigma = Alphabet()
        self.Sigma.add("")  # add epsilon at 0
        self.Sigma.add("o")
        self.Sigma.add("n")
        self.Sigma.add("y")
        self.Sigma.add("s")
        self.Sigma.add("e")

        if alphabet is not None:
            self.Sigma = self.Sigma.load(alphabet)

        # process the data
        # TODO: modularize
        self.train = self.process(train, 100000)
        self.dev = self.process(dev, 1000)
        self.test = self.process(test, 1000)

        # dump the alphabet
        self.Sigma.save("alphabets/sigma-{0}.alphabet".format(self.split_num))
        # create model
        self.segmenter = None
        logging.info("Segmenter Type: {0}".format(self.segmenter_type))
        logging.info("Decoder Type: {0}".format(self.decode_type))

        if self.segmenter_type == TREE:
            self.segmenter = TreeSegmenter(G)
            self.features = TreeFeatures(self.G, 1000)
        elif self.segmenter_type == CHUNK:
            self.segmenter = ChunkSegmenter(G)
            self.features = ChunkFeatures(self.G, 1000)
        else:
            raise Exception('Illicit Model Type')

        # transducer
        self.transducer = TransducerModel(self.train,
                                          self.dev,
                                          self.test,
                                          self.Sigma,
                                          L=self.T_L,
                                          eta=self.T_eta,
                                          C=self.T_C)

        # extract features
        self.features.featurize(self.train, 'train')
        self.features.featurize(self.dev, 'dev')
        self.features.featurize(self.test, 'test')

        # dimension of data
        self.d = 2**22 + self.features.offset
        self.updater = LazyRegularizedAdagrad(self.d,
                                              L=2,
                                              C=self.S_C,
                                              eta=0.1,
                                              fudge=1e-4)
        self.updater.w[0] = 10
        self.updater.w[1] = 10

    def save_transducer(self, directory, i):
        """ save the transducer weights """
        np.save(directory + "/transducer-{0}-{1}.npy".format(*(self.sig, i)),
                array(self.transducer.updater.w))

    def save_segmenter(self, directory, i):
        np.save(directory + "/segmenter-{0}-{1}.npy".format(*(self.sig, i)),
                array(self.updater.w))

    def save(self, directory):
        self.save_transducer(directory, 'final')
        self.save_segmenter(directory, 'final')

    def optimize(self,
                 t=None,
                 load=False,
                 transducer=None,
                 segmenter=None,
                 iterations=20):
        """ optimize the model """
        if load:
            assert transducer is not None
            #assert segmenter is not None
            self.load_weights(transducer, segmenter)
        if t is None:
            return
        elif t == JOINT:
            self.optimize_joint(iterations)
        elif t == PIPE:
            self.optimize_pipeline(iterations)

    def load_weights(self, transducer, segmenter):
        """ load weights """
        self.transducer.updater.w = np.load(transducer)
        self.updater.w = np.load(segmenter)

    def optimize_pipeline(self, iterations=10):
        """ optimize """

        for i in xrange(iterations):
            self.transducer.optimize(1, i)
            train_acc = self.transducer.evaluate(self.train)
            dev_acc = self.transducer.evaluate(self.dev)
            test_acc = self.transducer.evaluate(self.test)
            logging.info(
                "transducer epoch {0} train acc: {1}".format(*(i, train_acc)))
            logging.info(
                "transducer epoch {0} dev acc: {1}".format(*(i, dev_acc)))
            logging.info(
                "transducer epoch {0} test acc: {1}".format(*(i, test_acc)))
            self.save_transducer(self.weights, i)

        print self.transducer.evaluate(self.dev)

        if self.segmenter_type == TREE:
            self.optimize_tree(iterations)
        elif self.segmenter_type == CHUNK:
            self.optimize_chunk(iterations)

    def optimize_chunk(self, iterations):
        """ optimize the model """
        for i in xrange(iterations):
            for tree in iterview(self.train,
                                 colored('Pass %s' % (i + 1), 'blue')):
                gt0, gt, ge = self.features.potentials_catchup(
                    tree, self.updater.w, self.updater)
                dgt0, dgt, dge = zeros_like(gt0), zeros_like(gt), zeros_like(
                    ge)
                self.segmenter.dll(tree, ge, gt0, gt, dge, dgt0, dgt)
                self.features.update(tree, dge, dgt0, dgt, self.updater)
                self.updater.step += 1

            self.save_segmenter(self.weights, i)
            train_acc, train_f1 = self.decode(self.train, 'train', VITERBI)
            dev_acc, dev_f1 = self.decode(self.dev, 'dev', VITERBI)
            test_acc, test_f1 = self.decode(self.test, 'test', VITERBI)
            logging.info("chunk epoch {0} train acc: {1}".format(*(i, train_acc)))
            logging.info("chunk epoch {0} dev acc: {1}".format(*(i, dev_acc)))
            logging.info("chunk epoch {0} test acc: {1}".format(*(i, test_acc)))
            logging.info("chunk epoch {0} train f1: {1}".format(*(i, train_f1)))
            logging.info("chunk epoch {0} dev f1: {1}".format(*(i, dev_f1)))
            logging.info("chunk epoch {0} test f1: {1}".format(*(i, test_f1)))

    def optimize_tree(self, iterations):
        """ optimize the model """
        for i in xrange(iterations):
            for tree in iterview(self.train,
                                 colored('Pass %s' % (i + 1), 'blue')):
                psi = self.features.potentials_catchup(tree, self.updater.w,
                                                       self.updater)
                dpsi = self.segmenter.dll(tree, psi)
                self.features.update(tree, dpsi, self.updater)
                self.updater.step += 1

            self.save_segmenter(self.weights, i)
            train_acc, train_f1 = self.decode(self.train, 'train', VITERBI)
            dev_acc, dev_f1 = self.decode(self.dev, 'dev', VITERBI)
            test_acc, test_f1 = self.decode(self.test, 'test', VITERBI)
            logging.info("tree epoch {0} train acc: {1}".format(*(i, train_acc)))
            logging.info("tree epoch {0} dev acc: {1}".format(*(i, dev_acc)))
            logging.info("tree epoch {0} test acc: {1}".format(*(i, test_acc)))
            logging.info("tree epoch {0} train f1: {1}".format(*(i, train_f1)))
            logging.info("tree epoch {0} dev f1: {1}".format(*(i, dev_f1)))
            logging.info("tree epoch {0} test f1: {1}".format(*(i, test_f1)))

    def optimize_joint(self, iterations, num_samples=10, eta1=0.0, eta2=0.0):
        """ optimize jointly using importance sampling """
        # TODO: unit test
        self.updater.eta = eta1
        for i in xrange(iterations):
            samples = self.transducer.sample(self.train, num=num_samples)
            for tree, sample in iterview(zip(self.train, samples),
                                         colored('Pass %s' % (i + 1), 'blue')):
                # compute approximate partition function
                logZ = NINF
                strings, weights = [], []
                for (ur, count) in sample.items():
                    score = self.transducer.ll(tree, ur)
                    if self.segmenter_type == CHUNK:
                        score += self.score_chunk(tree, ur)
                    elif self.segmenter_type == TREE:
                        score += self.score_tree(tree, ur)
                    # TODO: double check
                    logZ = logaddexp(logZ, score)
                    weights.append(score)
                    strings.append(ur)

                #TODO: make more elegant
                tmp = []
                for weight in weights:
                    tmp.append(weight - logZ)  # TODO: double check
                weights = tmp

                # take a tranducer weight gradient step with the importance sampling
                self.transducer.step_is(tree, strings, weights, eta=eta2)
                # take a segmenter weight gradient step with the importance sampling
                for ur, weight in zip(sample, weights):
                    if self.segmenter_type == CHUNK:
                        self.is_chunk(tree, ur, weight)
                    elif self.segmenter_type == TREE:
                        self.is_tree(tree, ur, weight)
                self.updater.step += 1

    def is_chunk(self, tree, ur, weight):
        """ importance sampling gradient step tree """
        tree.update_ur(ur)
        self.features.featurize_instance(tree)
        gt0, gt, ge = self.features.potentials_catchup(tree, self.updater.w,
                                                       self.updater)
        dgt0, dgt, dge = zeros_like(gt0), zeros_like(gt), zeros_like(ge)
        self.segmenter.dll(tree, ge, gt0, gt, dge, dgt0, dgt)
        dgt0 *= weight
        dgt *= weight
        dge *= weight
        self.features.update(tree, dge, dgt0, dgt, self.updater)

    def is_tree(self, tree, ur, weight):
        """ importance sampling gradient step chunk """
        tree.update_ur(ur)
        self.features.featurize_instance(tree)
        psi = self.features.potentials_catchup(tree, self.updater.w,
                                               self.updater)
        dpsi = self.segmenter.dll(tree, psi)
        dpsi *= weight
        self.features.update(tree, dpsi, self.updater)

    def baseline_ur(self, data):
        """ baseline ur """
        for tree in iterview(data, colored('Updating Baseline UR', 'red')):
            tree.ur_samples = []
            tree.ur_samples.append(tree.sr)

    def decode_ur(self, data):
        """ decodes the UR """
        for tree in iterview(data, colored('Updating Viterbi UR', 'red')):
            tree.ur_samples = []
            viterbi_ur = self.transducer.decode(tree)[1]
            tree.ur_samples.append(viterbi_ur)

    def oracle_ur(self, data):
        """ uses the oracle  UR """
        for tree in iterview(data, colored('Updating Oracle UR', 'red')):
            tree.ur_samples = []
            tree.ur_samples.append(tree.ur_gold)

    def sample_ur(self, data, num_samples=1000):
        """ samples the UR """
        samples = self.transducer.sample(data, num=num_samples)
        for tree, samples in iterview(zip(data, samples),
                                      colored('Sampling', 'red')):
            tree.ur_samples = []
            viterbi_ur = self.transducer.decode(tree)[1]
            tree.ur_samples.append(viterbi_ur)
            for sample, count in samples.items():
                tree.ur_samples.append(sample)

    def decode_chunk(self, tree, ur):
        """ decodes a chunk """
        tree.update_ur(ur)
        self.features.featurize_instance(tree)
        gt0, gt, ge = self.features.potentials_catchup(tree, self.updater.w,
                                                       self.updater)
        best, segments, labels = self.segmenter.decode(tree, ge, gt0, gt)
        truth = [tree.ur[i:j] for i, j in tree.indices]
        guess = [tree.ur[i:j] for i, j in segments]
        return truth, guess

    def decode_tree(self, tree, ur):
        """ decodes a tree """
        tree.update_ur(ur)
        self.features.featurize_instance(tree)
        psi = self.features.potentials_catchup(tree, self.updater.w,
                                               self.updater)
        max_score, tree_string, max_spans = self.segmenter.argmax(
            tree.M, self.G, psi, tree.ur)

        gold_spans = set(tree.spans)
        guess_spans = set(max_spans)  # spans from the decoded tree, not the gold ones
        p, r = 0.0, 0.0
        for span in gold_spans:
            if span in guess_spans:
                p += 1.0
        p /= len(gold_spans)
        for span in guess_spans:
            if span in gold_spans:
                r += 1.0
        r /= max(len(guess_spans), 1)
        f1 = (2 * p * r) / (p + r) if p + r > 0 else 0.0

        # TODO: horrible hack
        segmentation = tree_string.replace("(",
                                           "").replace(")",
                                                       "").replace(" ", "")
        for i in xrange(100):
            segmentation = segmentation.replace(str(i), "")
        segmentation = segmentation.split(":")
        guess = segmentation[:-1]
        truth = [x[0] for x in to_segmentation(tree.root)]
        return truth, guess, f1

    def score_chunk(self, tree, ur):
        """ scores a chunk """
        tree.update_ur(ur)
        self.features.featurize_instance(tree)
        M = tree.M
        gt0, gt, ge = self.features.potentials_catchup(tree, self.updater.w,
                                                       self.updater)
        return self.segmenter.logZ(ge, gt0, gt, M)

    def score_tree(self, tree, ur):
        """ scores a tree """
        tree.update_ur(ur)
        self.features.featurize_instance(tree)
        M = tree.M
        psi = self.features.potentials_catchup(tree, self.updater.w,
                                               self.updater)
        return self.segmenter.logZ(M, self.G, psi)

    def decode(self, data, data_type, decode_type=None, sep=u"#"):
        """ decode the chunker """
        if decode_type is None:
            decode_type = self.decode_type

        if decode_type == ORACLE:
            self.oracle_ur(data)
        elif decode_type == BASELINE:
            self.baseline_ur(data)
        elif decode_type == VITERBI:
            self.decode_ur(data)
        elif decode_type == SAMPLE:
            self.sample_ur(data)
        else:
            raise Exception('Illicit Decode Type')

        ur_correct, ur_total = 0.0, 0.0
        correct, f1, tree_f1, lev, total = 0.0, 0.0, 0.0, 0.0, 0.0
        for tree in iterview(data, colored('Decoding', 'red')):
            max_ur, max_score = None, NINF
            counter = 0
            for ur in tree.ur_samples:
                tree.update_ur(ur)
                counter += 1
                score = 0.0
                score = self.transducer.ll(tree, ur)
                #print
                #print "LL", self.transducer.ll(tree, ur)
                if self.segmenter_type == CHUNK:
                    score += self.score_chunk(tree, ur)
                    #print "SCORE", self.score_chunk(tree, ur)
                    #print ur
                    #raw_input()
                elif self.segmenter_type == TREE:
                    score += self.score_tree(tree, ur)
                # take the best importance sample
                if score >= max_score:
                    max_score = score
                    max_ur = ur
                    #print "counter", counter
            if max_ur == tree.ur_gold:
                ur_correct += 1
            ur_total += 1
            truth, guess, tree_f1_tmp = None, None, None
            if self.segmenter_type == CHUNK:
                truth, guess = self.decode_chunk(tree, max_ur)
            elif self.segmenter_type == TREE:
                truth, guess, tree_f1_tmp = self.decode_tree(tree, max_ur)
                tree_f1 += tree_f1_tmp

            # ACCURACY
            if truth == guess:
                correct += 1
            # LEVENSHTEIN
            lev += Levenshtein.distance(sep.join(truth), sep.join(guess))
            # F1
            set1, set2 = set(guess), set(truth)
            p, r = 0.0, 0.0
            for e in set1:
                if e in set2:
                    p += 1
            for e in set2:
                if e in set1:
                    r += 1
            p /= len(set1)
            r /= len(set2)
            if p + r > 0:
                f1 += 2 * p * r / (p + r)
            total += 1

        logging.info("decoder type: {0}".format(decode_type))
        logging.info("{0} ur acc: {1}".format(*(data_type,
                                                ur_correct / total)))
        logging.info("{0} seg acc: {1}".format(*(data_type, correct / total)))
        logging.info("{0} f1: {1}".format(*(data_type, f1 / total)))
        logging.info("{0} edit: {1}".format(*(data_type, lev / total)))
        if self.segmenter_type == TREE:
            logging.info("{0} tree f1: {1}".format(*(data_type,
                                                     tree_f1 / total)))
        # return summary metrics so callers (e.g. optimize_chunk/optimize_tree)
        # can log them per epoch
        return correct / total, f1 / total

    def process(self, fname, maximum=100):
        """ 
        Put the string data into the data structures
        necessary for training and decoding the model. 
        """
        processed = []
        data = Data(fname)
        for counter, (sr, (tree, (indices, index_labels))) in enumerate(data):
            if counter == maximum:
                break
            ur = to_string(tree)
            for s in list(sr):
                self.Sigma.add(s)
            for s in list(ur):
                self.Sigma.add(s)

            spans, labels = [], []
            for node in walk(tree):
                spans.append((node.i, node.j))
                labels.append(node.label)
            t = Tree(self.G, sr, ur, spans, labels, indices, index_labels,
                     len(spans), tree)
            processed.append(t)

        return processed
Example #19
    def evaluate(self, data, maximum=100000000):
        """ decode the model """
        correct, total = 0, 0
        counter = 0
        for instance in iterview(data, colored('Decoding', 'red')):
            if counter == maximum:
                break
            psi = self.features.potentials_catchup(instance, self.updater.w, self.updater)
            ur1 = instance.ur
            results = self.model.decode(instance.sr, psi, minx=MINX, miny=MINY)
            ll = self.model.ll(instance.sr, ur1, psi, minx=MINX, miny=MINY)
            score, ur2 = results[0], results[1]
            if ur1 == ur2:
                correct += 1
            print ur1, ur2
            total += 1
            counter += 1
        print
        return float(correct) / total

    def ll(self, tree, ur):
        """ gets the log-likelihood """
        psi = self.features.potentials_catchup(tree, self.updater.w, self.updater)
        return self.model.ll(tree.sr, ur, psi, minx=MINX, miny=MINY)
    
if __name__ == "__main__":
    data = [("hablar", "hablando"), ("comer", "comiendo")]
    Sigma = Alphabet()
    Sigma.add("")
    for (x, y) in data:
        for c in list(x):
            Sigma.add(c)
        for c in list(y):
            Sigma.add(c)
            
    tm = TransductionModel(Sigma, data)
    profile.runctx("tm.train()", locals(), globals())
Example #20
class TransducerModel(object):
    """ Transducer model """

    def __init__(self, train, dev, test, Sigma, IL=6, L=2, eta=0.01, C=0.0001):
        self.train = train
        self.dev = dev
        self.test = test
        self.Sigma = Sigma
        assert self.Sigma[""] == 0
        self.IL = IL
        self.C = C
        self.L = L
        self.eta = eta

        # X and Y
        self.X, self.Y = Alphabet(), Alphabet()
        self.X.add(""); self.Y.add("")
        for s, si in self.Sigma.items():
            if si == 0:
                continue
            self.X.add(s)
        for s, si in self.Sigma.items():
            if si == 0:
                continue
            self.Y.add(s)
        self.X.freeze(); self.Y.freeze()

        # first order (possibly extend)
        self.P = Alphabet()
        self.P.add("")
        for s, si in self.Sigma.items():
            if si == 0:
                continue
            self.P.add(s)
        self.P.add("oo")
        self.P.add("nn")
        self.P.add("yy")
        self.P.add("ss")
        self.P.add("ee")
        self.P.freeze()
        
        # create Z
        self.Z = Alphabet()
        self.Z[""] = 0
        for p, pi in self.P.items():
            for o, oi in self.Y.items():
                 z = p+o
                 self.Z.add(z)
        self.Z.freeze()
        
        # model
        self.model = Transducer(self.Sigma, self.X, self.Y, self.P, self.Z, IL = self.IL)
        self.features = TransducerFeatures(self.X, self.Y, self.P)
        self.features.featurize(self.train, 'train')
        self.features.featurize(self.dev, 'dev')
        self.features.featurize(self.test, 'test')
        
        self.d = 2**22 + self.features.offset
        self.updater = LazyRegularizedAdagrad(self.d, L=self.L, C=self.C, eta=self.eta, fudge=1e-4)
        self.updater.w[0] = 10.0
        self.updater.w[1] = -10.0

    def optimize(self, iterations=10, start=0):
        """ optimize the model  """
        #np.random.shuffle(self.train)
        for i in xrange(iterations):
            for instance in iterview(self.train, colored('Pass %s' % (i+1+start), 'blue')):
                psi = self.features.potentials_catchup(instance, self.updater.w, self.updater)
                dpsi = zeros_like(psi)
                x, y = instance.sr, instance.ur
                #print "LL", self.model.ll(x, y, psi, minx=MINX, miny=MINY)
                dpsi = self.model.dll(x, y, psi, minx=MINX, miny=MINY)
                self.features.update(instance, dpsi, self.updater)
                self.updater.step += 1

    def step_is(self, tree, strings, weights, eta=0.0):
        """ optimize the model  """
        self.updater.eta = eta
        psi = self.features.potentials_catchup(tree, self.updater.w, self.updater)
        dpsi = zeros_like(psi)
        dpsi = self.model.dll_is(tree.sr, tree.ur, strings, weights, psi, minx=MINX, miny=MINY)
        self.features.update(tree, dpsi, self.updater)
        self.updater.step += 1
                
    def sample(self, data, num=1000):
        """ sample """
        samples = []
        inside = 0
        correct1, correct2, total = 0, 0, 0
        for instance in iterview(data, colored('Sampling', 'green')):
            psi = self.features.potentials_catchup(instance, self.updater.w, self.updater)
            sr = instance.sr
            dist = {}
            for s in self.model.sample(sr, psi, num=num):
                output = ""
                for x, y in s:
                    output += y
                if output not in dist:
                    dist[output] = 0
                dist[output] += 1

            count = dist[instance.ur_gold] if instance.ur_gold in dist else 0
            decoded = self.decode(instance)[1]

            if decoded != instance.ur_gold and count > 0:
                inside += 1
            if decoded == instance.ur_gold:
                correct1 += 1
            if instance.ur_gold in dist:
                correct2 += 1
            total += 1
            samples.append(dist)
            
        # TODO: put into log
        #print ; print inside
        #print correct1 / total, correct2 / total
        return samples

    def decode(self, instance):
        """ Decodes an instance """
        psi = self.features.potentials_catchup(instance, self.updater.w, self.updater)
        ur1 = instance.ur
        results = self.model.decode(instance.sr, psi, minx=MINX, miny=MINY)
        return results
        
    def evaluate(self, data, maximum=100000000):
        """ decode the model """
        correct, total = 0, 0
        counter = 0
        for instance in iterview(data, colored('Decoding', 'red')):
            if counter == maximum:
                break
            psi = self.features.potentials_catchup(instance, self.updater.w, self.updater)
            ur1 = instance.ur
            results = self.model.decode(instance.sr, psi, minx=MINX, miny=MINY)
            ll = self.model.ll(instance.sr, ur1, psi, minx=MINX, miny=MINY)
            score, ur2 = results[0], results[1]
            if ur1 == ur2:
                correct += 1
            print ur1, ur2
            total += 1
            counter += 1
        print
        return float(correct) / total

    def ll(self, tree, ur):
        """ gets the log-likelihood """
        psi = self.features.potentials_catchup(tree, self.updater.w, self.updater)
        return self.model.ll(tree.sr, ur, psi, minx=MINX, miny=MINY)
Example #21
import numpy as np

from arsenal.alphabet import Alphabet
from arsenal.iterview import progress
from arsenal.terminal import colors
from collections import Counter, defaultdict
from grafl.test import make_model_func
from grafl.dataset.edge_dataset import BWD_dataset

np.set_printoptions(precision=4)

L = {
    0: 'coordinate',
    1: 'hypernym',
    2: 'hyponym',
}

A = Alphabet()
A.map([x.strip().split()[1] for i, x in enumerate(file('res/bowman_wordnet_longer_shuffled_synset_relations.map')) if i > 2])

tst = BWD_dataset('test').data
trn = BWD_dataset('train').data
trn_x = trn[0]
trn_y = trn[1]
seen = set(trn_x.flatten()) | set(trn_y.flatten())

X,Y,_ = tst

X = list(A.lookup_many(X.flatten()))
Y = list(A.lookup_many(Y.flatten()))
#D = np.array([X,Y,L.flatten()]).T

model_file = 'res/experiments/BWD-projection-Softmax_best.pkl'
Example #22
def build_domain(data):
    """
    Do feature extraction to determine the set of *supported* features, i.e.
    those active in the ground-truth configuration and the active labels. This
    function maps each feature and each label to an integer.
    """
    L = Alphabet()
    A = Alphabet()
    for x in data:
        L.add_many(x.truth)
        A.add_many(f for token in x.sequence for f in token.attributes)
    # domains are now ready
    L.freeze()
    A.stop_growth()
    return (L, A)
Example #23
def build_domain(data):
    """
    Do feature extraction to determine the set of *supported* features, i.e.
    those active in the ground-truth configuration and the active labels. This
    function maps each feature and each label to an integer.
    """
    L = Alphabet()
    A = Alphabet()
    for x in data:
        L.add_many(x.truth)  # add labels to label domain
        # extract features of the target path
        F = x.F
        path = x.truth
        A.add_many(F(0, None, path[0]))
        A.add_many(k for t in xrange(1, x.N) for k in F(t, path[t-1], path[t]))
    # domains are now ready
    L.freeze()
    A.stop_growth()
    return (L, A)
Example #24
 def __init__(self, prefix_length=0, suffix_length=4):
     self.prefix_length, self.suffix_length = prefix_length, suffix_length
     self.attributes = Alphabet()
     self.word2attributes = {}
     self.words = Alphabet()
Example #25
def build_domain(data):
    """
    Do feature extraction to determine the set of *supported* features, i.e.
    those active in the ground-truth configuration and the active labels. This
    function maps each feature and each label to an integer.
    """
    L = Alphabet()
    A = Alphabet()
    for x in data:
        L.add_many(x.truth)  # add labels to label domain
        # extract features of the target path
        F = x.F
        path = x.truth
        A.add_many(F(0, None, path[0]))
        A.add_many(k for t in xrange(1, x.N)
                   for k in F(t, path[t - 1], path[t]))
    # domains are now ready
    L.freeze()
    A.stop_growth()
    return (L, A)
Example #26
class Lexicon(object):
    """ Reads in the universal morpholigcal lexicon """

    def __init__(self, fin, atts, vals, av, avs):
        # probably redundant...
        # but not optimizing for space so who cares
        self.atts, self.vals, self.av, self.avs = atts, vals, av, avs

        self.lexicon = dd(list)
        self.words = Alphabet()
        
        with codecs.open(fin, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line == "":
                    continue
                word, lemma, tags = line.split(" ")
                self.words.add(word)
                tags = tags.split(",")

                for tag in tags:
                    if len(tag.split("=")) != 2:
                        print line
                        print tag
                    a, v = tag.split("=")
                    self.av[a].add(v)

                    self.atts.add(a)
                    self.vals.add(v)

                self.lexicon[word].append((lemma, tags))

        # get rid of default dict wrapper
        self.lexicon = dict(self.lexicon)
        self.av = dict(self.av)
        
        for a, s in self.av.items():
            for v in s:
                self.avs.add((a, v))

    def create_vectors(self):
        self.N = len(self.avs)
        self.W = zeros((len(self.lexicon), self.N))
        
        # use Manaal's encoding (http://arxiv.org/abs/1512.05030)
        for w, lst in self.lexicon.items():
            vec = zeros((self.N))
            for l, ts in lst:
                for tag in ts:
                    a, v = tag.split("=")

                    #if a != "pos":
                    #    continue
                    
                    j = self.avs[(a, v)]
                    vec[j] = 1.0
            i = self.words[w]
            self.W[i] = vec

        
    def pp(self, word):
        """ pretty print the morphological tag of a word """
        i = self.words[word]
        lst = []
        for n in xrange(self.N):
            if self.W[i, n] > 0:
                lst.append("=".join(self.avs.lookup(n)))
        return word, ",".join(lst)

    
    def __getitem__(self, word):
        i = self.words[word]
        return self.W[i]
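A hedged construction sketch for the Lexicon class above. lexicon.txt is a hypothetical space-separated file with "word lemma tag1=v1,tag2=v2" lines, and the attribute/value alphabets are supplied by the caller, matching the constructor signature:

from collections import defaultdict as dd
from arsenal.alphabet import Alphabet

atts, vals, avs = Alphabet(), Alphabet(), Alphabet()
av = dd(set)
lexicon = Lexicon("lexicon.txt", atts, vals, av, avs)
lexicon.create_vectors()
some_word = next(iter(lexicon.lexicon))
print(lexicon.pp(some_word))  # (word, "att1=val1,att2=val2,...")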
Example #27
    def __init__(self,
                 train,
                 dev,
                 test,
                 decode_type,
                 split_num,
                 log_fname=None,
                 segmenter_type='tree',
                 G=3,
                 weights='weights',
                 alphabet=None,
                 T_L=2,
                 T_eta=1.0,
                 T_C=0.0000001,
                 S_L=2,
                 S_eta=1.0,
                 S_C=0.00000001):
        # set up the logging system
        log = logging.getLogger('')
        log.setLevel(logging.INFO)
        format = logging.Formatter("%(asctime)s<>%(levelname)s<>%(message)s")

        ch = logging.StreamHandler(sys.stdout)
        ch.setFormatter(format)
        log.addHandler(ch)

        fh = logging.handlers.RotatingFileHandler(log_fname,
                                                  maxBytes=(1048576 * 5),
                                                  backupCount=7)
        fh.setFormatter(format)
        log.addHandler(fh)

        self.G = G
        self.split_num = split_num
        self.segmenter_type = segmenter_type
        self.decode_type = decode_type
        self.S_L, self.S_eta, self.S_C = S_L, S_eta, S_C
        self.T_L, self.T_eta, self.T_C = T_L, T_eta, T_C
        self.sig = "split={0},L={1},C={2},eta={3},type={4}".format(
            *(self.split_num, self.T_L, self.T_C, self.T_eta,
              self.decode_type))
        logging.info("Transducer Regularizer Type: L={0}".format(T_L))
        logging.info("Transducer Regularizer Coefficient: C={0}".format(T_C))
        logging.info("Transducer Learning Rate: eta={0}".format(T_eta))
        logging.info("Segmenter Regularizer Type: L={0}".format(S_L))
        logging.info("Segmenter Regularizer Coefficient: C={0}".format(S_C))
        logging.info("Segmenter Learning Rate: eta={0}".format(S_eta))

        self.weights = weights
        self.Sigma = Alphabet()
        self.Sigma.add("")  # add epsilon at 0
        self.Sigma.add("o")
        self.Sigma.add("n")
        self.Sigma.add("y")
        self.Sigma.add("s")
        self.Sigma.add("e")

        if alphabet is not None:
            self.Sigma = self.Sigma.load(alphabet)

        # process the data
        # TODO: modularize
        self.train = self.process(train, 100000)
        self.dev = self.process(dev, 1000)
        self.test = self.process(test, 1000)

        # dump the alphabet
        self.Sigma.save("alphabets/sigma-{0}.alphabet".format(self.split_num))
        # create model
        self.segmenter = None
        logging.info("Segmenter Type: {0}".format(self.segmenter_type))
        logging.info("Decoder Type: {0}".format(self.decode_type))

        if self.segmenter_type == TREE:
            self.segmenter = TreeSegmenter(G)
            self.features = TreeFeatures(self.G, 1000)
        elif self.segmenter_type == CHUNK:
            self.segmenter = ChunkSegmenter(G)
            self.features = ChunkFeatures(self.G, 1000)
        else:
            raise Exception('Illicit Model Type')

        # transducer
        self.transducer = TransducerModel(self.train,
                                          self.dev,
                                          self.test,
                                          self.Sigma,
                                          L=self.T_L,
                                          eta=self.T_eta,
                                          C=self.T_C)

        # extract features
        self.features.featurize(self.train, 'train')
        self.features.featurize(self.dev, 'dev')
        self.features.featurize(self.test, 'test')

        # dimension of data
        self.d = 2**22 + self.features.offset
        self.updater = LazyRegularizedAdagrad(self.d,
                                              L=2,
                                              C=self.S_C,
                                              eta=0.1,
                                              fudge=1e-4)
        self.updater.w[0] = 10
        self.updater.w[1] = 10
Example #28
class PTB(object):
    "Load the POS-tagged Penn Treebank."

    def __init__(self, base, coarse=True):
        self.base = base
        self.coarse = coarse
        self.Y = Alphabet()   # tag set
        self.V, self.V_freq = Alphabet(), {} # vocabulary
        self.V2Y, self.Y2V = dd(set), dd(set)
        self.train, self.dev, self.test = [], [], []
        self.prefixes, self.suffixes = {}, {}
        self.prefix2int, self.suffix2int = {}, {}

        # Read data and create standard splits according to
        # http://aclweb.org/aclwiki/index.php?title=POS_Tagging_(State_of_the_art)
        #
        # train split [0,18]
        for sectionid in range(19):
            read = self.read_section(sectionid)
            for sentence in read:
                #for tag, word in sentence:
                #    if tag == self.Y["BAD"]:
                #        break

                self.train.append(sentence)
                for y, w in sentence:
                    self.V.add(w)
                    self.V2Y[w].add(self.Y.lookup(y))
                    self.Y2V[self.Y.lookup(y)].add(w)
                    if w not in self.V_freq:
                        self.V_freq[w] = 0
                    self.V_freq[w] += 1
                    for prefix in self.extract_prefixes(w):
                        if prefix not in self.prefixes:
                            self.prefixes[prefix] = 0
                        self.prefixes[prefix] += 1
                    for suffix in self.extract_suffixes(w):
                        if suffix not in self.suffixes:
                            self.suffixes[suffix] = 0
                        self.suffixes[suffix] += 1

        # dev split [19,21]
        for sectionid in range(19, 22):
            read = self.read_section(sectionid)

            for sentence in read:
                #for tag, word in sentence:
                #    if tag == self.Y["BAD"]:
                #        break

                self.dev.append(sentence)

        # test split [22,24]
        for sectionid in range(22, 25):
            #for tag, word in sentence:
            #    if tag == self.Y["BAD"]:
            #        break

            self.test.extend(self.read_section(sectionid))
        self.Y.freeze()

    def extract_prefixes(self, w, n=10):
        """ gets prefixes up to length n """
        prefixes = []
        for i in range(1, min(len(w)+1, n+1)):
            segment = w[:i]
            if segment not in self.prefix2int:
                self.prefix2int[segment] = len(self.prefix2int)
            prefixes.append(w[:i])
        return prefixes

    def extract_suffixes(self, w, n=10):
        """ gets suffixes up to lenght n """
        suffixes = []
        for i in range(1, min(len(w)+1, n+1)):
            segment = w[-i:]
            if segment not in self.suffix2int:
                self.suffix2int[segment] = len(self.suffix2int)
            suffixes.append(w[-i:])
        return suffixes

    def tag_bigrams(self):
        """ extract all tag bigrams """
        bigram2count = {}
        for sentence in self.train:
            for (tag1, _), (tag2, _) in zip(sentence, sentence[1:]):
                key = (tag1, tag2)
                if key not in bigram2count:
                    bigram2count[key] = 0
                bigram2count[key] += 1
        return bigram2count

    def read_section(self, sectionid):
        "Read a section number `sectionid` from the PTB."
        root = os.path.join(self.base, str(sectionid).zfill(2))
        for fname in os.listdir(root):
            if not fname.endswith('pos.gz'):
                continue
            with gzip.open(os.path.join(root, fname), 'rt') as f:
                for chunk in f.read().split('======================================'):
                    if chunk.strip():
                        if self.coarse:
                            # STUPID BIO ENCODING
                            #yield [(self.Y["NNP"] if "NNP" in y else self.Y["OTHER"], w) for w, y in re_tagged.findall(chunk)]
                            # Note: clean up punc reduction
                            yield [(self.Y["PUNC"] if y in PUNC else self.Y[y[0]], w) for w, y in re_tagged.findall(chunk)]
                        else:
                            # TODO: what to do about bars in the tags?
                            # FIND OUT AND CLEAN UP
                            yield [(self.Y["PUNC"] if y in PUNC else self.Y[y.split("|")[0]] if "|" in y else self.Y[y], w) for w, y in re_tagged.findall(chunk)]

    def pp(self, sentence):
        "Pretty print."
        return ' '.join('%s/%s' % (w, self.Y.lookup(t)) for (t, w) in sentence)
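A hedged usage sketch for the PTB loader above. wsj_root is a hypothetical path to the POS-tagged WSJ sections (directories 00-24 containing the *.pos.gz files that read_section expects):

wsj_root = '/path/to/treebank/tagged/wsj'
ptb = PTB(wsj_root, coarse=True)
print(len(ptb.train), len(ptb.dev), len(ptb.test))  # sentences per split
print(ptb.pp(ptb.train[0]))                         # word/tag pretty-print of one sentence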
Example #29
import numpy as np

from arsenal.alphabet import Alphabet
from arsenal.iterview import progress
from arsenal.terminal import colors
from collections import Counter, defaultdict
from grafl.test import make_model_func
from grafl.dataset.edge_dataset import BWD_dataset

np.set_printoptions(precision=4)

L = {
    0: 'coordinate',
    1: 'hypernym',
    2: 'hyponym',
}

A = Alphabet()
A.map([
    x.strip().split()[1] for i, x in enumerate(
        file('res/bowman_wordnet_longer_shuffled_synset_relations.map'))
    if i > 2
])

tst = BWD_dataset('test').data
trn = BWD_dataset('train').data
trn_x = trn[0]
trn_y = trn[1]
seen = set(trn_x.flatten()) | set(trn_y.flatten())

X, Y, _ = tst

X = list(A.lookup_many(X.flatten()))
Example #30
def main():
    datafile = sys.argv[1]

    train, test = traintest(datafile)
    print 'train: %s, test: %s' % (len(train), len(test))

    from scipy.sparse import dok_matrix
    from sklearn import linear_model
    from sklearn.svm import SVC
    from arsenal.alphabet import Alphabet

    N_FEATURES = 100000

    alphabet = Alphabet(random_int=N_FEATURES)

    def _f1(name, data, c, verbose=True):
        if verbose:
            print
            print name
        f = F1()
        for (i, x) in enumerate(data):

            phi = dok_matrix((1, N_FEATURES))
            for k in x.features:
                phi[0, alphabet[k] % N_FEATURES] = 1.0

            [y] = c.predict(phi)
            f.report(i, y, x.label)
        f.scores(verbose=verbose)
        return f

    M = len(train)

    Y = []
    X = dok_matrix((M, N_FEATURES))
    for i, x in enumerate(train):
        # binary features
        for k in x.features:
            X[i, alphabet[k] % N_FEATURES] = 1.0
        Y.append(x.label)
    X = X.tocsc()

    c = SVC(class_weight={
        'author': 1000,
        'title': 1000,
        'other': 1.0
    },
            verbose=1)

    c.fit(X, Y)

    _f1('train', train, c)
    ff = _f1('test', test, c, verbose=1)

    if 0:
        import numpy as np
        import matplotlib.pyplot as pl
        from mpl_toolkits.mplot3d import Axes3D
        ax = pl.figure().add_subplot(111, projection='3d')

        pl.ion()

        data = []

        for (author_weight,
             title_weight) in iterview(np.random.uniform(1, 10,
                                                         size=(100, 2))):
            print
            print 'params:', (author_weight, title_weight)

            c = SVC(class_weight={
                'author': author_weight,
                'title': title_weight,
                'other': 1.0
            },
                    verbose=1)

            #c = linear_model.SGDClassifier()
            c.fit(X, Y)

            #_f1('train', train, c)
            ff = _f1('test', test, c, verbose=1)

            score = sum(x for (_, _, _, _, x) in ff.scores(verbose=0))

            data.append((author_weight, title_weight, score))
            print 'score:', score

            x, y, z = zip(*data)
            ax.clear()
            ax.scatter(x, y, z)
            ax.figure.canvas.draw()

        print 'done'
        pl.ioff()
        pl.show()
Example #31
def main():
    from arsenal.alphabet import Alphabet
    from arsenal.maths import spherical

    D = 100

    alphabet = Alphabet(random_int=D)
    weights = spherical(D)
    direction = spherical(D)

    #sentence = 'Papa ate the caviar with the spoon in the park .'.split()
    sentence = 'Papa ate the caviar with the spoon .'.split()

    if 0:
        grammar = """
        S       S .
        S       NP VP
        NP      D N
        NP      NP PP
        VP      V NP
        VP      VP PP
        PP      P NP
        NP      Papa
        N       caviar
        N       spoon
        N       park
        V       ate
        P       with
        P       in
        D       the
        """

    else:
        grammar = """
        S       X .
        X       X X
        X       Papa
        X       ate
        X       the
        X       caviar
        X       with
        X       spoon
        X       in
        X       park
        """

    rhs = load_grammar(grammar)

    if 1:
        # This code branch enumerates all (exponentially many) valid
        # derivations.
        root = semiring_enumeration(sentence, rhs)
        for d in root.x:
            print(post_process(d))
        assert len(root.x) == len(set(root.x))

    def binary_features(sentence,X,Y,Z,i,j,k):
        return alphabet.map(['%s -> %s %s [%s,%s,%s]' % (X,Y,Z,i,j,k)])

    def unary_features(sentence,X,Y,i,k):
        return alphabet.map(['%s -> %s [%s,%s]' % (X,Y,i,k)])

    root = semiring_mert(sentence, rhs, weights, direction, binary_features, unary_features)

    mert_derivations = [] #set()
    for x in root.points:
        print(x)
        d = x.derivation()
        mert_derivations.append(d)

    assert len(mert_derivations) == len(set(mert_derivations))
    #root.draw()

    # Compare the set of derivations found by the MERT semiring to 'brute force'
    # linesearch. Note: Linesearch might only find a subset of derivations found
    # by MERT if the grid isn't fine enough.
    brute_derivations = set()
    for step in np.linspace(-20,20,1000):
        root = semiring_linesearch(sentence, rhs, weights, step, direction, binary_features, unary_features)
        d = root.derivation()
        brute_derivations.add(d)

    # NOTE: need to take upper hull of mert (so it's currently an over estimate)
    print('mert:', len(mert_derivations))
    print('brute:', len(brute_derivations))
    assert brute_derivations.issubset(mert_derivations)