Пример #1
0
def main(args):
    if not args:
        return

    # last argument is the output file for the pickled lm database
    outfilename = args.pop()

    legal_chars = frozenset(' abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')

    # accumulator: positives, total
    def initial(): return [0, 0]
    lm = defaultdict(initial)
    num_bits = 8
    bit_backoffs = tuple(initial() for i in xrange(num_bits))
    num_samples = 0

    # open this here so that we error prior to time-consuming gathering of stats
    with open(outfilename, 'wb') as outfile:

        for sample in text_utils.deterministic_labels(filenames_stream(args), legal_chars):
            num_samples += 1

            char_code, prior_codes, bit_index, bit_value = sample

            # unigrams by bit_index
            backoff = bit_backoffs[bit_index]
            backoff[-1] += 1

            # prior is two prior char_codes and the bit index
            key = prior_codes + (bit_index,)
            stats = lm[key]
            stats[-1] += 1

            if bit_value != 0:
                stats[0] += 1
                backoff[0] += 1

        # max likelihood for the bit
        bit_backoffs = tuple(1 if 2 * samples >= total else 0 for samples, total in bit_backoffs)
        lm = frozendict((key, 1 if 2 * samples >= total else 0) for key, (samples, total) in lm.iteritems())

        cPickle.dump(num_samples, outfile, -1)
        cPickle.dump(dict(lm), outfile, -1)


    # verify what we wrote
    with open(outfilename, 'rb') as infile:
        num_samples2 = cPickle.load(infile)
        lm2 = frozendict(cPickle.load(infile))
        
    assert num_samples2 == num_samples
    assert lm2 == lm

    print len(args), num_samples, len(lm)
    print bit_backoffs
Пример #2
0
def frozenbijection(iterable):
    """
    Build a two-way mapping between the distinct items of iterable and
    consecutive integer ids.

    Returns the pair (by_id, id_by): by_id is the sorted tuple of unique
    items, so by_id[i] is the item with id i; id_by is a frozendict giving
    the id of each item.  Items must be hashable; an unhashable (mutable)
    item results in a TypeError.

    Examples

    >>> frozenbijection('hello world')
    ((' ', 'd', 'e', 'h', 'l', 'o', 'r', 'w'), frozendict({' ': 0, 'e': 2, 'd': 1, 'h': 3, 'l': 4, 'o': 5, 'r': 6, 'w': 7}))
    >>> frozenbijection(['world', 'wide', 'hello', 'world'])
    (('hello', 'wide', 'world'), frozendict({'wide': 1, 'hello': 0, 'world': 2}))

    Example of error on mutable items

    >>> frozenbijection(['world', 'wide', 'hello', 'world', ['this item is a mutable list']]) #doctest: +ELLIPSIS
    Traceback (most recent call last):
      ...
        frozenbijection(('world', 'wide', 'hello', 'world', ['this item is a mutable list']))
      File "<stdin>", line ###, in frozenbijection
      File "<stdin>", line ###, in sorteduniquetuple
    TypeError: ...unhashable...

    """
    ordered = sorteduniquetuple(iterable)
    # the position of an item in the sorted tuple is its id
    return ordered, frozendict(
        (item, index) for index, item in enumerate(ordered))
Пример #3
0
def frozenbijection(iterable):
    """
    Number the unique items of iterable in sorted order and return both
    directions of that numbering as the pair (by_id, id_by).

    by_id is a sorted tuple of the unique items; id_by is a frozendict that
    maps every item back to its index in by_id.  Every item has to be
    hashable, otherwise a TypeError about an unhashable object is raised.

    Examples

    >>> frozenbijection('hello world')
    ((' ', 'd', 'e', 'h', 'l', 'o', 'r', 'w'), frozendict({' ': 0, 'e': 2, 'd': 1, 'h': 3, 'l': 4, 'o': 5, 'r': 6, 'w': 7}))
    >>> frozenbijection(['world', 'wide', 'hello', 'world'])
    (('hello', 'wide', 'world'), frozendict({'wide': 1, 'hello': 0, 'world': 2}))

    Example of error on mutable items

    >>> frozenbijection(['world', 'wide', 'hello', 'world', ['this item is a mutable list']]) #doctest: +ELLIPSIS
    Traceback (most recent call last):
      ...
        frozenbijection(('world', 'wide', 'hello', 'world', ['this item is a mutable list']))
      File "<stdin>", line ###, in frozenbijection
      File "<stdin>", line ###, in sorteduniquetuple
    TypeError: ...unhashable...

    """
    by_id = sorteduniquetuple(iterable)
    # invert the tuple: each item is paired with its position
    item_positions = ((item, position) for position, item in enumerate(by_id))
    id_by = frozendict(item_positions)
    return by_id, id_by
Пример #4
0
def collate_pairs(iterable):
    """
    Group the values of an iterable of (key, value) pairs by key.

    Returns a frozendict mapping each key to a tuple of the values that
    were paired with it.  The grouping is stable: within each tuple the
    values keep the order in which they were encountered.

    >>> col = collate_pairs(((1, 2), (2, 0), (1, 1), (1, 2), ('a', 'b'), ('a', None)))
    >>> for key in sorted(col.keys()): print key, col[key]
    1 (2, 1, 2)
    2 (0,)
    a ('b', None)
    """
    grouped = {}
    for key, value in iterable:
        # first sighting of a key creates its list; later ones extend it
        grouped.setdefault(key, []).append(value)
    return frozendict(
        (key, tuple(values)) for key, values in grouped.iteritems())
Пример #5
0
def collate_pairs(iterable):
    """
    Collect, for every key seen in the iterable of (key, value) pairs, the
    values that accompanied it.

    The result is a frozendict whose values are tuples.  Collation is
    stable, i.e. each tuple lists its values in their order of appearance
    in the iteration.

    >>> col = collate_pairs(((1, 2), (2, 0), (1, 1), (1, 2), ('a', 'b'), ('a', None)))
    >>> for key in sorted(col.keys()): print key, col[key]
    1 (2, 1, 2)
    2 (0,)
    a ('b', None)
    """
    buckets = defaultdict(list)
    for pair in iterable:
        key, value = pair
        buckets[key].append(value)
    # freeze the per-key lists into tuples before freezing the mapping
    frozen_items = ((key, tuple(values)) for key, values in buckets.iteritems())
    return frozendict(frozen_items)
Пример #6
0
class text_utils(object):
    """
    Namespace of lookup tables and static helpers for turning text into
    per-bit labels and synthetic samples.

    The tables below are computed once, when the class body is executed,
    from legal_chars and the char_bits() helper (defined elsewhere).  Judging
    by the doctest output, char_bits(char) yields eight 0/1 values for a
    character, bit index 0 being the most significant bit -- TODO confirm
    against char_bits itself.

    >>> for key in sorted(text_utils.numbits_by_char.keys()): print repr(key), text_utils.numbits_by_char[key],
    ' ' 1 '#' 3 '(' 2 ')' 3 '*' 3 '+' 4 ',' 3 '.' 4 ':' 4 'a' 3 'b' 3 'c' 4 'd' 3 'e' 4 'f' 4 'g' 5 'h' 3 'i' 4 'j' 4 'k' 5 'l' 4 'm' 5 'n' 5 'o' 6 'p' 3 'q' 4 'r' 4 's' 5 't' 4 'u' 5 'v' 5 'w' 6 'x' 4 'y' 5 'z' 5
    
    >>> for key in sorted(text_utils.charset_by_numbits.keys()): print key, text_utils.charset_by_numbits[key]
    1 frozenset([' '])
    2 frozenset(['('])
    3 frozenset(['a', '#', 'b', 'd', ')', 'h', '*', ',', 'p'])
    4 frozenset(['c', 'e', 'f', 'i', '+', 'j', 'l', '.', 'q', 'r', 't', 'x', ':'])
    5 frozenset(['g', 'k', 'm', 'n', 's', 'u', 'v', 'y', 'z'])
    6 frozenset(['o', 'w'])

    >>> text_utils.num_chars_by_bit_index
    (0, 26, 35, 12, 18, 14, 18, 16)

    >>> for key in sorted(text_utils.charset_by_bit.keys()): print key, text_utils.charset_by_bit[key]
    1 frozenset(['a', 'c', 'b', 'e', 'd', 'g', 'f', 'i', 'h', 'k', 'j', 'm', 'l', 'o', 'n', 'q', 'p', 's', 'r', 'u', 't', 'w', 'v', 'y', 'x', 'z'])
    2 frozenset([' ', '#', ')', '(', '+', '*', ',', '.', ':', 'a', 'c', 'b', 'e', 'd', 'g', 'f', 'i', 'h', 'k', 'j', 'm', 'l', 'o', 'n', 'q', 'p', 's', 'r', 'u', 't', 'w', 'v', 'y', 'x', 'z'])
    3 frozenset([':', 'q', 'p', 's', 'r', 'u', 't', 'w', 'v', 'y', 'x', 'z'])
    4 frozenset(['*', 'l', 'n', ':', ')', '(', '+', 'j', 'm', ',', 'o', '.', 'i', 'h', 'y', 'x', 'z', 'k'])
    5 frozenset(['e', 'd', 'g', 'f', 'm', ',', 'o', '.', 'u', 't', 'w', 'v', 'n', 'l'])
    6 frozenset(['j', '#', 'n', 'c', 'b', 'g', 'f', ':', '+', '*', 'o', '.', 's', 'r', 'w', 'v', 'z', 'k'])
    7 frozenset(['c', 'a', '#', 'e', 'g', ')', '+', 'm', 'o', 'q', 'i', 's', 'u', 'w', 'y', 'k'])

    >>> for index, chars in enumerate(text_utils.charseq_by_bit_index): print index, chars
    0 (' ', '#', '(', ')', '*', '+', ',', '.', ':', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z')
    1 ('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z')
    2 (' ', '#', '(', ')', '*', '+', ',', '.', ':', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z')
    3 (':', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z')
    4 ('(', ')', '*', '+', ',', '.', ':', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'x', 'y', 'z')
    5 (',', '.', 'd', 'e', 'f', 'g', 'l', 'm', 'n', 'o', 't', 'u', 'v', 'w')
    6 ('#', '*', '+', '.', ':', 'b', 'c', 'f', 'g', 'j', 'k', 'n', 'o', 'r', 's', 'v', 'w', 'z')
    7 ('#', ')', '+', 'a', 'c', 'e', 'g', 'i', 'k', 'm', 'o', 'q', 's', 'u', 'w', 'y')
    """

    # the characters that are labeled; anything else is skipped by
    # deterministic_labels unless the caller supplies its own set
    legal_chars = frozenset(' abcdefghijklmnopqrstuvwxyz,:()#.*+')
    num_legal_chars = len(legal_chars)
    # NOTE(review): an earlier, now disabled, check here asserted
    # num_legal_chars == 27; the set above holds 35 characters (space, a-z
    # and eight punctuation marks), so that check is out of date

    # set bit count for each char
    numbits_by_char = frozendict(
        (char, sum(char_bits(char))) for char in legal_chars)

    # inverse of numbits_by_char: the set of chars having a given number of
    # set bits; built in a mutable defaultdict and then frozen
    charset_by_numbits = defaultdict(set)
    for char, numbits in numbits_by_char.iteritems():
        charset_by_numbits[numbits].add(char)
    charset_by_numbits = frozendict(
        (numbits, frozenset(chars))
        for numbits, chars in charset_by_numbits.iteritems())

    # number of chars with that bit set; izip(*...) transposes the per-char
    # bit sequences into per-bit-index columns, which are then summed
    num_chars_by_bit_index = tuple(
        sum(bits) for bits in izip(*imap(char_bits, legal_chars)))

    # sets of chars with given bit; a bit index that is set in no legal char
    # gets no entry at all (index 0 is absent in the doctest above)
    charset_by_bit = defaultdict(set)
    for char in legal_chars:
        for index, bit in enumerate(char_bits(char)):
            if bit:
                charset_by_bit[index].add(char)
    charset_by_bit = frozendict((index, frozenset(chars))
                                for index, chars in charset_by_bit.iteritems())

    # same information as charset_by_bit, but as a tuple indexed by bit
    # index holding sorted tuples of chars; the list is grown on demand so
    # that every bit index has a slot, even when no char has that bit set
    charseq_by_bit_index = list()
    for char in legal_chars:
        for bit_index, bit_value in enumerate(char_bits(char)):
            while len(charseq_by_bit_index) <= bit_index:
                charseq_by_bit_index.append(set())
            if bit_value:
                charseq_by_bit_index[bit_index].add(char)
    # note: an empty set is replaced by the full legal_chars set since a uniform sample is appropriate
    # note: legal_chars is re-bound as a loop variable through repeat()
    # because names defined in the class body are not visible inside the
    # body of a generator expression
    charseq_by_bit_index = tuple(
        tuple(sorted(charset if charset else legal_chars))
        for charset, legal_chars in izip(charseq_by_bit_index,
                                         repeat(legal_chars)))

    # sanity check: the set bits counted per char and per bit index must
    # add up to the same total
    assert sum(numbits_by_char.values()) == sum(num_chars_by_bit_index)

    @staticmethod
    def deterministic_labels(stream, legal_chars=legal_chars):
        """
        Return a generator that yields event labels from the text stream.

        Each label is a tuple (char_code, priors, bit_index, bit_value):
        char_code is ord() of the current character, priors is a tuple of
        the ord() values of the two previously emitted characters (oldest
        first, -1 where there is no such character yet), and
        bit_index/bit_value enumerate the values produced by char_bits()
        for the character.

        Characters not in legal_chars are skipped; legal_chars defaults to
        the class-level set.  A single space is inserted between
        consecutive tokens once the first legal character has been seen.
        The stream is split up by the stream_lines, line_tokens and
        token_chars helpers, which are defined elsewhere.

        >>> doc = '''
        ...  foo 
        ...
        ... a b
        ... '''
        >>> from cStringIO import StringIO
        >>> x = tuple(text_utils.deterministic_labels(StringIO(doc)))
        >>> len(x)
        56
        >>> x[0:3], x[-3:]
        (((102, (-1, -1), 0, 0), (102, (-1, -1), 1, 1), (102, (-1, -1), 2, 1)), ((98, (97, 32), 5, 0), (98, (97, 32), 6, 1), (98, (97, 32), 7, 0)))
        """
        # NOTE(review): the next two locals are not used in this function
        numbits_by_char = text_utils.numbits_by_char
        num_chars_by_bit_index = text_utils.num_chars_by_bit_index
        num_priors = 2
        # sliding window of the most recent char codes, seeded with -1
        priors = deque(-1 for x in xrange(num_priors))
        # no separator is emitted ahead of the very first character
        sep = ()
        for line in stream_lines(stream):
            for token in line_tokens(line):
                # the separator (if any) is processed like a regular char
                # in front of the token's own chars
                for char in chain(sep, token_chars(token)):
                    if char not in legal_chars:
                        continue
                    # from now on every token is preceded by one space
                    sep = (' ', )
                    ord_char = ord(char)
                    # snapshot the window before it is advanced below
                    tuple_priors = tuple(priors)
                    for bit_index, bit_value in enumerate(char_bits(char)):
                        # the trailing comma makes this a 4-tuple
                        yield ord_char, tuple_priors, bit_index, bit_value,
                    # slide the window: drop the oldest code, add this one
                    priors.popleft()
                    priors.append(ord_char)

    @staticmethod
    def damage_labels(permil, labels, seed=None):
        """
        Randomly damage the bit-value labels of a label stream.
        Introduce damage to permil / 1000 of the labels.  If seed is
        not None it is used to reproducibly seed the randomness.

        Returns a generator over the labels; a damaged label has its
        bit_value replaced by 1 - bit_value, all other fields are kept.
        A private random.Random instance is used, so the module-level
        random state is not touched.

        >>> from cStringIO import StringIO

        No damage
        >>> tuple(text_utils.damage_labels(0, text_utils.deterministic_labels(StringIO('a b cd')), seed=0))[-10:-6]
        ((99, (98, 32), 6, 1), (99, (98, 32), 7, 1), (100, (32, 99), 0, 0), (100, (32, 99), 1, 1))

        Fifty-percent damage
        >>> tuple(text_utils.damage_labels(500, text_utils.deterministic_labels(StringIO('a b cd')), seed=0))[-10:-6]
        ((99, (98, 32), 6, 1), (99, (98, 32), 7, 1), (100, (32, 99), 0, 1), (100, (32, 99), 1, 0))
        """
        mil = 1000
        assert 0 <= permil <= mil
        rand = random.Random()
        rand.seed(seed)
        # NOTE(review): randint includes both end points, so this draws from
        # mil + 1 values (0..1000); the damage rate is therefore
        # permil / 1001 and permil == 1000 leaves a label intact whenever
        # 1000 is drawn -- confirm whether randrange(mil) was intended
        randint = partial(rand.randint, 0, mil)
        for label in labels:
            if randint() < permil:
                char_code, priors, bit_index, bit_value = label
                # note: 1 - bit_value only works for (0, 1) set of labels
                yield char_code, priors, bit_index, 1 - bit_value
            else:
                yield label

    @staticmethod
    def make_model_samplers(means, vars):
        """
        Return a tuple of sampler callables, one per class.

        means and vars are equal-length sequences; for each (mean, var)
        pair a one-dimensional SimpleGaussianModel (defined elsewhere) with
        diagonal covariance is created and configured via set_model, and
        its bound sample method is returned as the sampler.

        >>> SimpleGaussianModel.seed(0)
        >>> tuple(int(128 * sampler()) for sampler in text_utils.make_model_samplers((0, 1), (1, 0.5)))
        (-23, 130)
        >>> tuple(int(128 * sampler()) for sampler in text_utils.make_model_samplers((0, 1, 2), (1, 0.5, 2)))
        (89, 119, 511)
        """
        # NOTE(review): the parameter name vars shadows the builtin; it is
        # part of the signature, so it is left as is
        assert len(means) == len(vars)
        num_classes = len(means)
        models = tuple(
            SimpleGaussianModel(1, SimpleGaussianModel.DIAGONAL_COVARIANCE)
            for i in xrange(num_classes))
        for model, mean, var in izip(models, means, vars):
            # one-element tuples since the models are one-dimensional
            model.set_model((mean, ), (var, ))
        samplers = tuple(model.sample for model in models)
        assert len(samplers) == num_classes
        return samplers

    @staticmethod
    def generate_samples(samplers, labels):
        """
        Return a generator that pairs each label with a synthetic sample.

        For every label (char_code, priors, bit_index, bit_value) the
        generator yields (label, (value, code)) where value is
        int(128 * samplers[bit_value]()) and code is ord() of a character
        picked with random.choice from charseq_by_bit_index[bit_index].
        The module-level random state is used for the choice, which is why
        the example seeds it.

        >>> from cStringIO import StringIO
        >>> random.seed(0)
        >>> SimpleGaussianModel.seed(0)
        >>> tuple(text_utils.generate_samples(text_utils.make_model_samplers((0, 1), (1, 0.5)), text_utils.deterministic_labels(StringIO('a b cd'))))[-10:-6]
        (((99, (98, 32), 6, 1), (-20, 115)), ((99, (98, 32), 7, 1), (51, 111)), ((100, (32, 99), 0, 0), (96, 32)), ((100, (32, 99), 1, 1), (182, 109)))
        """
        charseq_by_bit_index = text_utils.charseq_by_bit_index
        choice = random.choice
        for label in labels:
            # char_code and priors are unpacked but not used below
            char_code, priors, bit_index, bit_value = label
            # bit_value selects the sampler, bit_index the candidate chars
            yield label, (int(128 * samplers[bit_value]()),
                          ord(choice(charseq_by_bit_index[bit_index])))