def main(args):
    """
    Gather per-bit language-model statistics from text and pickle the result.

    The final item of args names the output file; it is popped off args (the
    caller's list is modified) and the remaining items are handed to
    filenames_stream() to provide the text.  If args is empty nothing is done.

    For every label produced by text_utils.deterministic_labels() two pairs of
    counts, [positives, total], are updated: one indexed by bit_index alone
    (the backoff) and one keyed by the two prior char codes plus the bit
    index.  Each pair of counts is then reduced to the maximum-likelihood bit
    value, 1 if at least half of the observations were non-zero, otherwise 0.

    The number of samples and the keyed model (as a plain dict) are pickled,
    in that order, to the output file.  The file is then read back and
    compared with what was written, and summary figures are printed.
    """
    if not args:
        return

    # the output file for the pickled lm database is the final argument
    outfilename = args.pop()
    legal_chars = frozenset(' abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')

    def new_counts():
        # accumulator layout: [positives, total]
        return [0, 0]

    def majority_bit(positives, total):
        # max likelihood for the bit; an exact tie resolves to 1
        return 1 if 2 * positives >= total else 0

    num_bits = 8
    lm = defaultdict(new_counts)
    bit_backoffs = tuple(new_counts() for _ in xrange(num_bits))
    num_samples = 0

    # open the output up front so that we error prior to the time-consuming
    # gathering of stats
    with open(outfilename, 'wb') as outfile:
        labels = text_utils.deterministic_labels(filenames_stream(args), legal_chars)
        for char_code, prior_codes, bit_index, bit_value in labels:
            num_samples += 1
            # unigram counts, by bit_index
            backoff = bit_backoffs[bit_index]
            # the context is the two prior char_codes and the bit index
            stats = lm[prior_codes + (bit_index,)]
            backoff[1] += 1
            stats[1] += 1
            if bit_value != 0:
                stats[0] += 1
                backoff[0] += 1

        # collapse each pair of counts to its max-likelihood bit
        bit_backoffs = tuple(majority_bit(*counts) for counts in bit_backoffs)
        lm = frozendict((key, majority_bit(*counts)) for key, counts in lm.iteritems())

        cPickle.dump(num_samples, outfile, -1)
        cPickle.dump(dict(lm), outfile, -1)

    # read the database back and verify what we wrote
    with open(outfilename, 'rb') as infile:
        num_samples2 = cPickle.load(infile)
        lm2 = frozendict(cPickle.load(infile))
    assert num_samples2 == num_samples
    assert lm2 == lm

    # single-argument parenthesized prints: same output as the print statement
    print('%s %s %s' % (len(args), num_samples, len(lm)))
    print(bit_backoffs)
def frozenbijection(iterable):
    """
    Build a two-way mapping between the unique items of iterable and their ids.

    Returns the pair (by_id, id_by).  by_id is the sorted tuple of the unique
    items in iterable, so an item's id is its index in that tuple.  id_by is a
    frozendict giving the id for each item.

    The items must be immutable; a mutable item results in a TypeError about
    an unhashable object.

    Examples

    >>> frozenbijection('hello world')
    ((' ', 'd', 'e', 'h', 'l', 'o', 'r', 'w'), frozendict({' ': 0, 'e': 2, 'd': 1, 'h': 3, 'l': 4, 'o': 5, 'r': 6, 'w': 7}))
    >>> frozenbijection(['world', 'wide', 'hello', 'world'])
    (('hello', 'wide', 'world'), frozendict({'wide': 1, 'hello': 0, 'world': 2}))

    Example of error on mutable items

    >>> frozenbijection(['world', 'wide', 'hello', 'world', ['this item is a mutable list']]) #doctest: +ELLIPSIS
    Traceback (most recent call last):
      ...
      frozenbijection(('world', 'wide', 'hello', 'world', ['this item is a mutable list']))
      File "<stdin>", line ###, in frozenbijection
      File "<stdin>", line ###, in sorteduniquetuple
    TypeError: ...unhashable...
    """
    unique_items = sorteduniquetuple(iterable)
    # an item's position in the sorted tuple serves as its id
    index_of = frozendict((item, index) for index, item in enumerate(unique_items))
    return unique_items, index_of
def collate_pairs(iterable):
    """
    Group the values of a sequence of (key, value) pairs by their key.

    Returns a frozendict in which each key maps to a tuple of the values that
    were paired with that key.  The grouping is stable: the values for a given
    key appear in the order in which they were encountered in the iteration.

    >>> col = collate_pairs(((1, 2), (2, 0), (1, 1), (1, 2), ('a', 'b'), ('a', None)))
    >>> for key in sorted(col.keys()): print key, col[key]
    1 (2, 1, 2)
    2 (0,)
    a ('b', None)
    """
    groups = dict()
    for key, value in iterable:
        # appending keeps each key's values in order of appearance
        groups.setdefault(key, []).append(value)
    return frozendict((key, tuple(values)) for key, values in groups.iteritems())
class text_utils(object):
    """
    Namespace of lookup tables and static helpers for turning text into
    streams of per-bit labels, and for damaging or sampling from such streams.

    The tables are built once, in the class body, from legal_chars and the
    bits that char_bits() gives for each char.  The doctests below show their
    contents; bit index 0 is never set for any legal char.

    >>> for key in sorted(text_utils.numbits_by_char.keys()): print repr(key), text_utils.numbits_by_char[key],
    ' ' 1 '#' 3 '(' 2 ')' 3 '*' 3 '+' 4 ',' 3 '.' 4 ':' 4 'a' 3 'b' 3 'c' 4 'd' 3 'e' 4 'f' 4 'g' 5 'h' 3 'i' 4 'j' 4 'k' 5 'l' 4 'm' 5 'n' 5 'o' 6 'p' 3 'q' 4 'r' 4 's' 5 't' 4 'u' 5 'v' 5 'w' 6 'x' 4 'y' 5 'z' 5

    >>> for key in sorted(text_utils.charset_by_numbits.keys()): print key, text_utils.charset_by_numbits[key]
    1 frozenset([' '])
    2 frozenset(['('])
    3 frozenset(['a', '#', 'b', 'd', ')', 'h', '*', ',', 'p'])
    4 frozenset(['c', 'e', 'f', 'i', '+', 'j', 'l', '.', 'q', 'r', 't', 'x', ':'])
    5 frozenset(['g', 'k', 'm', 'n', 's', 'u', 'v', 'y', 'z'])
    6 frozenset(['o', 'w'])

    >>> text_utils.num_chars_by_bit_index
    (0, 26, 35, 12, 18, 14, 18, 16)

    >>> for key in sorted(text_utils.charset_by_bit.keys()): print key, text_utils.charset_by_bit[key]
    1 frozenset(['a', 'c', 'b', 'e', 'd', 'g', 'f', 'i', 'h', 'k', 'j', 'm', 'l', 'o', 'n', 'q', 'p', 's', 'r', 'u', 't', 'w', 'v', 'y', 'x', 'z'])
    2 frozenset([' ', '#', ')', '(', '+', '*', ',', '.', ':', 'a', 'c', 'b', 'e', 'd', 'g', 'f', 'i', 'h', 'k', 'j', 'm', 'l', 'o', 'n', 'q', 'p', 's', 'r', 'u', 't', 'w', 'v', 'y', 'x', 'z'])
    3 frozenset([':', 'q', 'p', 's', 'r', 'u', 't', 'w', 'v', 'y', 'x', 'z'])
    4 frozenset(['*', 'l', 'n', ':', ')', '(', '+', 'j', 'm', ',', 'o', '.', 'i', 'h', 'y', 'x', 'z', 'k'])
    5 frozenset(['e', 'd', 'g', 'f', 'm', ',', 'o', '.', 'u', 't', 'w', 'v', 'n', 'l'])
    6 frozenset(['j', '#', 'n', 'c', 'b', 'g', 'f', ':', '+', '*', 'o', '.', 's', 'r', 'w', 'v', 'z', 'k'])
    7 frozenset(['c', 'a', '#', 'e', 'g', ')', '+', 'm', 'o', 'q', 'i', 's', 'u', 'w', 'y', 'k'])

    >>> for index, chars in enumerate(text_utils.charseq_by_bit_index): print index, chars
    0 (' ', '#', '(', ')', '*', '+', ',', '.', ':', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z')
    1 ('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z')
    2 (' ', '#', '(', ')', '*', '+', ',', '.', ':', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z')
    3 (':', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z')
    4 ('(', ')', '*', '+', ',', '.', ':', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'x', 'y', 'z')
    5 (',', '.', 'd', 'e', 'f', 'g', 'l', 'm', 'n', 'o', 't', 'u', 'v', 'w')
    6 ('#', '*', '+', '.', ':', 'b', 'c', 'f', 'g', 'j', 'k', 'n', 'o', 'r', 's', 'v', 'w', 'z')
    7 ('#', ')', '+', 'a', 'c', 'e', 'g', 'i', 'k', 'm', 'o', 'q', 's', 'u', 'w', 'y')
    """

    # NOTE: generator expressions in a class body cannot see the class-level
    # names, except in their outermost iterable, which is evaluated in the
    # class scope.  The table-building expressions below rely on that.

    legal_chars = frozenset(' abcdefghijklmnopqrstuvwxyz,:()#.*+')
    num_legal_chars = len(legal_chars)
    #assert num_legal_chars == 27

    # set bit count for each char
    numbits_by_char = frozendict(
        (char, sum(char_bits(char))) for char in legal_chars)

    # the inverse grouping: the set of chars having each set-bit count
    charset_by_numbits = defaultdict(set)
    for char, numbits in numbits_by_char.iteritems():
        charset_by_numbits[numbits].add(char)
    charset_by_numbits = frozendict(
        (numbits, frozenset(chars)) for numbits, chars in charset_by_numbits.iteritems())

    # number of chars with that bit set
    num_chars_by_bit_index = tuple(
        sum(bits) for bits in izip(*imap(char_bits, legal_chars)))

    # sets of chars with given bit; an index that no char sets gets no entry
    charset_by_bit = defaultdict(set)
    for char in legal_chars:
        for index, bit in enumerate(char_bits(char)):
            if bit:
                charset_by_bit[index].add(char)
    charset_by_bit = frozendict(
        (index, frozenset(chars)) for index, chars in charset_by_bit.iteritems())

    # sorted sequence of chars with given bit, indexable by every bit index
    charseq_by_bit_index = list()
    for char in legal_chars:
        for bit_index, bit_value in enumerate(char_bits(char)):
            # grow the list so that there is a slot for this bit index
            while len(charseq_by_bit_index) <= bit_index:
                charseq_by_bit_index.append(set())
            if bit_value:
                charseq_by_bit_index[bit_index].add(char)
    # note: an empty set is replaced by the full legal_chars set since a
    # uniform sample is appropriate
    # legal_chars is threaded through izip/repeat because the body of the
    # generator expression cannot see the class-level name
    charseq_by_bit_index = tuple(
        tuple(sorted(charset if charset else legal_chars))
        for charset, legal_chars in izip(charseq_by_bit_index, repeat(legal_chars)))

    # both sides count every set bit across all the legal chars
    assert sum(numbits_by_char.values()) == sum(num_chars_by_bit_index)

    @staticmethod
    def deterministic_labels(stream, legal_chars=legal_chars):
        """
        Return a generator that yields event labels from the text stream.

        One label is yielded for each bit of each legal char in the stream,
        where a single space char is inserted ahead of every token that
        follows the first legal char.  Chars not in legal_chars are skipped.

        Each label is a tuple (char_code, priors, bit_index, bit_value), where
        char_code is ord() of the char, priors is the pair of char codes that
        preceded it (-1 where there is no such char yet), and bit_index and
        bit_value enumerate char_bits() of the char.

        >>> doc = '''
        ... foo
        ...
        ... a b
        ... '''
        >>> from cStringIO import StringIO
        >>> x = tuple(text_utils.deterministic_labels(StringIO(doc)))
        >>> len(x)
        56
        >>> x[0:3], x[-3:]
        (((102, (-1, -1), 0, 0), (102, (-1, -1), 1, 1), (102, (-1, -1), 2, 1)), ((98, (97, 32), 5, 0), (98, (97, 32), 6, 1), (98, (97, 32), 7, 0)))
        """
        num_priors = 2
        # sliding window of the most recent char codes, oldest first
        priors = deque(-1 for x in xrange(num_priors))
        # nothing separates tokens until the first legal char has been seen
        sep = ()
        for line in stream_lines(stream):
            for token in line_tokens(line):
                for char in chain(sep, token_chars(token)):
                    if char not in legal_chars:
                        continue
                    sep = (' ', )
                    ord_char = ord(char)
                    # snapshot of the context, shared by all bits of this char
                    tuple_priors = tuple(priors)
                    for bit_index, bit_value in enumerate(char_bits(char)):
                        yield (ord_char, tuple_priors, bit_index, bit_value)
                    # slide the window only after all of the char's bits
                    priors.popleft()
                    priors.append(ord_char)

    @staticmethod
    def damage_labels(permil, labels, seed=None):
        """
        Randomly damage the bit-value labels of a label stream.  Introduce
        damage to permil / 1000 of the labels.  If seed is not None it is used
        to reproducibly seed the randomness.

        A damaged label has its bit_value replaced by 1 - bit_value; all other
        labels are yielded unchanged.  permil must be in the range 0 to 1000
        inclusive: 0 damages no labels and 1000 damages every label.

        >>> from cStringIO import StringIO

        No damage

        >>> tuple(text_utils.damage_labels(0, text_utils.deterministic_labels(StringIO('a b cd')), seed=0))[-10:-6]
        ((99, (98, 32), 6, 1), (99, (98, 32), 7, 1), (100, (32, 99), 0, 0), (100, (32, 99), 1, 1))

        Fifty-percent damage

        >>> tuple(text_utils.damage_labels(500, text_utils.deterministic_labels(StringIO('a b cd')), seed=0))[-10:-6]
        ((99, (98, 32), 6, 1), (99, (98, 32), 7, 1), (100, (32, 99), 0, 1), (100, (32, 99), 1, 0))
        """
        mil = 1000
        assert 0 <= permil <= mil
        # a private generator, so that the module-level randomness is untouched
        rand = random.Random()
        rand.seed(seed)
        # randint() includes both endpoints, so the draw is from exactly mil
        # equally likely values, 0 through mil - 1; then draw < permil holds
        # with probability permil / mil
        randint = partial(rand.randint, 0, mil - 1)
        for label in labels:
            if randint() < permil:
                char_code, priors, bit_index, bit_value = label
                # note: 1 - bit_value only works for (0, 1) set of labels
                yield char_code, priors, bit_index, 1 - bit_value
            else:
                yield label

    @staticmethod
    def make_model_samplers(means, vars):
        """
        Return a tuple of sampling callables, one for each (mean, var) pair.

        means and vars must have the same length.  A SimpleGaussianModel is
        created for each pair and is set from the one-item tuples (mean, ) and
        (var, ); the returned callables are the bound sample methods of these
        models, in the order of the pairs.

        >>> SimpleGaussianModel.seed(0)
        >>> tuple(int(128 * sampler()) for sampler in text_utils.make_model_samplers((0, 1), (1, 0.5)))
        (-23, 130)
        >>> tuple(int(128 * sampler()) for sampler in text_utils.make_model_samplers((0, 1, 2), (1, 0.5, 2)))
        (89, 119, 511)
        """
        assert len(means) == len(vars)
        num_classes = len(means)
        models = tuple(
            SimpleGaussianModel(1, SimpleGaussianModel.DIAGONAL_COVARIANCE)
            for i in xrange(num_classes))
        for model, mean, var in izip(models, means, vars):
            model.set_model((mean, ), (var, ))
        samplers = tuple(model.sample for model in models)
        assert len(samplers) == num_classes
        return samplers

    @staticmethod
    def generate_samples(samplers, labels):
        """
        Return a generator that pairs each label with a randomly drawn sample.

        Yields (label, (value, char_code)) for each label in labels.  value is
        int(128 * s()) where s is the item of samplers selected by the label's
        bit_value.  char_code is ord() of a char chosen, using the
        module-level random.choice, from the chars listed for the label's
        bit_index in charseq_by_bit_index.

        >>> from cStringIO import StringIO
        >>> random.seed(0)
        >>> SimpleGaussianModel.seed(0)
        >>> tuple(text_utils.generate_samples(text_utils.make_model_samplers((0, 1), (1, 0.5)), text_utils.deterministic_labels(StringIO('a b cd'))))[-10:-6]
        (((99, (98, 32), 6, 1), (-20, 115)), ((99, (98, 32), 7, 1), (51, 111)), ((100, (32, 99), 0, 0), (96, 32)), ((100, (32, 99), 1, 1), (182, 109)))
        """
        charseq_by_bit_index = text_utils.charseq_by_bit_index
        choice = random.choice
        for label in labels:
            char_code, priors, bit_index, bit_value = label
            # the sampler is called before the char is chosen; the seeded
            # doctest output depends on this order
            yield label, (int(128 * samplers[bit_value]()),
                          ord(choice(charseq_by_bit_index[bit_index])))