Пример #1
0
def integerize(data):
    """Map a dataset onto integer ids.

    Each item of `data` is a (label, features) pair; labels and features
    are interned into fresh alphabets.

    Returns a triple (label alphabet, feature alphabet, integerized dataset).
    """
    feature_alphabet = Alphabet()
    label_alphabet = Alphabet()
    encoded = []
    for label, features in data:
        feature_ids = fromiter(feature_alphabet.map(features), dtype=int32)
        encoded.append((label_alphabet[label], feature_ids))
    return (label_alphabet, feature_alphabet, encoded)
Пример #2
0
def integerize(data):
    """Intern a dataset's labels and features as integer ids.

    Returns a triple (label alphabet, feature alphabet, integerized dataset),
    where the integerized dataset pairs each label id with an int32 array
    of feature ids.
    """
    feats = Alphabet()
    labels = Alphabet()
    instances = [
        (labels[y], fromiter(feats.map(phi), dtype=int32))
        for y, phi in data
    ]
    return (labels, feats, instances)
Пример #3
0
class Dataset(object):
    """Container for train/dev/test folds plus frequency indexes built
    from the training fold."""

    def __init__(self, train, dev, test):
        self.train = train
        self.dev = dev
        self.test = test
        # Index structures below are populated by `_index`.
        self.Y = Alphabet()          # tag set
        self.V = Alphabet()          # vocabulary
        self.V_freq = Counter()      # token unigram counts
        self.V2Y = defaultdict(set)  # tag dictionary: token -> set of tags
        self.prefixes = Counter()
        self.suffixes = Counter()
        self._index(self.train)

    def _index(self, data):
        "Populate alphabets and frequency tables from `data`."
        for sentence in data:
            for tag, token in sentence:
                self.Y.add(tag)
                self.V.add(token)
                self.V2Y[token].add(tag)
                self.V_freq[token] += 1
                for p in prefixes(token):
                    self.prefixes[p] += 1
                for s in suffixes(token):
                    self.suffixes[s] += 1

    def make_instances(self, fold, cls):
        "Convert tuples in data `fold` to instances of `cls`."
        instances = []
        fold_data = getattr(self, fold)
        for sentence in iterview(fold_data, msg='Features (%s)' % fold):
            tags, tokens = zip(*sentence)
            instances.append(cls(tokens, self.Y.map(tags), self))
        return instances

    def tag_ngram_counts(self, n):
        "Returns tag ngram count for subsequences of length n."

        def tag_sequences():
            """Iterate over the tag sequence of each training sentence
            (tags are yielded as stored, without conversion)."""
            for sentence in self.train:
                tags, _ = zip(*sentence)
                yield tags

        return ngram_counts(tag_sequences(), n)
Пример #4
0
def integerize(data):
    """
    Integerize dataset.

    Returns a triple (label alphabet, feature alphabet, integerized dataset).

    If the module-level flag `do_label_count` is set, prints a table of
    label frequencies (descending) and exits instead of integerizing.
    """
    if do_label_count:
        # Diagnostic mode: report label frequencies, then stop.
        label_count = defaultdict(int)
        for label, features in data:
            label_count[label] += 1
        # Python 3 compatible: `dict.items()` is a view there, so sort with
        # `sorted` instead of list.sort(); `print(...)` works on both 2 and 3.
        print('label count')
        for k, v in sorted(label_count.items(), key=lambda kv: -kv[1]):
            print('%20s => %s' % (k, v))
        sys.exit(0)

    F = Alphabet()
    L = Alphabet()
    I = [(L[label], fromiter(F.map(features), dtype=int32)) for label, features in data]
    return (L, F, I)
Пример #5
0
def integerize(data):
    """
    Integerize dataset.

    Returns a triple (label alphabet, feature alphabet, integerized dataset).

    When the module-level flag `do_label_count` is truthy, this instead
    prints label frequencies in descending order and terminates the process.
    """
    if do_label_count:
        # Diagnostic mode: count labels, print a sorted report, exit.
        label_count = defaultdict(int)
        for label, features in data:
            label_count[label] += 1
        # `sorted` + print() keep this working under Python 3, where
        # dict.items() returns a view with no .sort() and `print` is a function.
        ranked = sorted(label_count.items(), key=lambda kv: -kv[1])
        print('label count')
        for k, v in ranked:
            print('%20s => %s' % (k, v))
        sys.exit(0)

    F = Alphabet()
    L = Alphabet()
    I = [(L[label], fromiter(F.map(features), dtype=int32))
         for label, features in data]
    return (L, F, I)
Пример #6
0
from arsenal.iterview import progress
from arsenal.terminal import colors
from collections import Counter, defaultdict
from grafl.test import make_model_func
from grafl.dataset.edge_dataset import BWD_dataset

np.set_printoptions(precision=4)

# Relation-label id -> relation name.
L = {
    0: 'coordinate',
    1: 'hypernym',
    2: 'hyponym',
}

A = Alphabet()
# The Python 2 `file()` builtin was removed in Python 3; use open() via a
# context manager so the handle is also closed deterministically.
# The first 3 lines of the map file are headers; column 1 is the synset name.
with open('res/bowman_wordnet_longer_shuffled_synset_relations.map') as fh:
    A.map([x.strip().split()[1] for i, x in enumerate(fh) if i > 2])

tst = BWD_dataset('test').data
trn = BWD_dataset('train').data
trn_x = trn[0]
trn_y = trn[1]
# Vocabulary ids observed anywhere in the training pairs.
seen = set(trn_x.flatten()) | set(trn_y.flatten())

X, Y, _ = tst

# Convert test-pair ids back to their string names.
X = list(A.lookup_many(X.flatten()))
Y = list(A.lookup_many(Y.flatten()))

model_file = 'res/experiments/BWD-projection-Softmax_best.pkl'
Пример #7
0
from collections import Counter, defaultdict
from grafl.test import make_model_func
from grafl.dataset.edge_dataset import BWD_dataset

np.set_printoptions(precision=4)

# Relation-label id -> relation name.
L = {
    0: 'coordinate',
    1: 'hypernym',
    2: 'hyponym',
}

A = Alphabet()
# `file()` no longer exists in Python 3 — open() with a context manager
# replaces it and guarantees the handle is closed.
# Skip the 3 header lines; field 1 of each row is the synset name.
with open('res/bowman_wordnet_longer_shuffled_synset_relations.map') as fh:
    A.map([
        x.strip().split()[1] for i, x in enumerate(fh)
        if i > 2
    ])

tst = BWD_dataset('test').data
trn = BWD_dataset('train').data
trn_x = trn[0]
trn_y = trn[1]
# All vocabulary ids seen in training pairs.
seen = set(trn_x.flatten()) | set(trn_y.flatten())

X, Y, _ = tst

# Map test-pair ids back to string names.
X = list(A.lookup_many(X.flatten()))
Y = list(A.lookup_many(Y.flatten()))