def decode(model, istream, ostream, estream=sys.stderr):
    # reads in the model
    logging.info('Loading model: %s', model)
    T, vocab = load_model(model)
    logging.info('%d patterns and %d entries', len(vocab), T.size)
    # detect whether document boundary tokens were used in the model
    boundaries = '<doc>' in vocab
    # reads in the test documents
    logging.info('Reading test documents in (boundaries=%s) ...', boundaries)
    documents = read_documents(istream, boundaries)
    logging.info('%d test documents read', len(documents))
    # encode test documents using the model's vocabulary
    test = encode_test_documents(documents, vocab)
    # computes the log likelihood of each document
    L = loglikelihood(test, T)
    # dumps scores
    print >> ostream, '#doc\t#logprob\t#sentences\t#s_normalised\t#patterns\t#p_normalised'
    for i, ll in enumerate(L):
        num_sentences = len(test[i])
        num_patterns = sum(len(row) for row in test[i])
        print >> ostream, '{0}\t{1}\t{2}\t{3}\t{4}\t{5}'.format(
            i, ll, num_sentences, ll / num_sentences, num_patterns, ll / num_patterns)
    print >> estream, '#sum\t#mean'
    print >> estream, '{0}\t{1}'.format(L.sum(), np.mean(L))

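# The helpers used by decode (load_model, read_documents, encode_test_documents,
# loglikelihood) are defined elsewhere in the module. As a rough, illustrative
# sketch only -- not the project's actual implementation -- loglikelihood(test, T)
# is assumed to score each sentence's patterns against the patterns of the
# preceding sentence under an IBM1-style table t(pattern|trigger), with id 0
# reserved for the <null> trigger:
def _loglikelihood_sketch(test, T):
    L = np.zeros(len(test))
    for d, doc in enumerate(test):
        total = 0.0
        for prev, cur in zip(doc, doc[1:]):   # adjacent sentence pairs
            triggers = np.append(prev, 0)     # patterns of the previous sentence plus <null>
            for f in cur:
                # p(f | prev) = (1 / (l + 1)) * sum_e t(f | e)
                total += np.log(T[f, triggers].sum() / len(triggers))
        L[d] = total
    return L
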
def decode(unigrams, bigrams, c, istream, ostream, estream=sys.stderr):
    # reads in the model
    logging.info('Loading model: %s and %s', unigrams, bigrams)
    U, B, vocab = load_model(unigrams, bigrams)
    logging.info('%d unigrams and %d bigrams', U.shape[0], B.shape[0])
    # detect whether document boundary tokens were used in the model
    boundaries = '<doc>' in vocab
    # detect whether insertion was switched on
    insertion = B[0, :].sum() > 0
    # reads in the test documents
    logging.info('Reading test documents in (boundaries=%s) ...', boundaries)
    documents = read_documents(istream, boundaries)
    logging.info('%d test documents read', len(documents))
    # encode test documents using the model's vocabulary
    test = encode_test_documents(documents, vocab)
    # computes the log likelihood of each document
    L = loglikelihood(test, U, B, c, insertion)
    # dumps scores
    print >> ostream, '#doc\t#logprob\t#sentences\t#s_normalised\t#patterns\t#p_normalised'
    for i, ll in enumerate(L):
        num_sentences = len(test[i])
        num_patterns = sum(len(row) for row in test[i])
        print >> ostream, '{0}\t{1}\t{2}\t{3}\t{4}\t{5}'.format(
            i, ll, num_sentences, ll / num_sentences, num_patterns, ll / num_patterns)
    print >> estream, '#sum\t#mean'
    print >> estream, '{0}\t{1}'.format(L.sum(), np.mean(L))

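# A minimal sketch of the smoothed conditional probability assumed inside
# loglikelihood(test, U, B, c, insertion): an add-c estimate of
# p(pattern v | trigger u) built from the bigram counts B and unigram counts U.
# This is an illustrative assumption about the helper, not its actual code;
# when insertion is on, the <null> trigger (row 0) also competes as a trigger.
def _smoothed_prob_sketch(U, B, u, v, c):
    V = U.shape[0]
    return (B[u, v] + c) / (U[u] + c * V)
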
def decode_many(model, ipaths, opaths, jobs, estream=sys.stderr):
    # reads in the model
    logging.info('Loading model: %s', model)
    T, vocab = load_model(model)
    logging.info('%d patterns and %d entries', len(vocab), T.size)
    # detect whether document boundary tokens were used in the model
    boundaries = '<doc>' in vocab
    # reads in the test documents
    logging.info('Reading test documents in (boundaries=%s) ...', boundaries)
    tests = [None] * len(ipaths)
    for i, ipath in enumerate(ipaths):
        documents = read_documents(smart_open(ipath), boundaries)
        logging.info('%s: %d test documents read', ipath, len(documents))
        # encode test documents using the model's vocabulary
        tests[i] = encode_test_documents(documents, vocab)
    # computes the log likelihood of each document in each test file
    pool = Pool(jobs)
    all_L = pool.map(partial(wrapped_loglikelihood, T=T), tests)
    print >> estream, '#file\t#sum\t#mean'
    for ipath, opath, test, L in izip(ipaths, opaths, tests, all_L):
        with smart_open(opath, 'w') as ostream:
            # dumps scores
            print >> ostream, '#doc\t#logprob\t#sentences\t#s_normalised\t#patterns\t#p_normalised'
            for i, ll in enumerate(L):
                num_sentences = len(test[i])
                num_patterns = sum(len(row) for row in test[i])
                print >> ostream, '{0}\t{1}\t{2}\t{3}\t{4}\t{5}'.format(
                    i, ll, num_sentences, ll / num_sentences, num_patterns, ll / num_patterns)
        print >> estream, '{0}\t{1}\t{2}'.format(opath, L.sum(), np.mean(L))

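# wrapped_loglikelihood is defined elsewhere; it presumably exists so that
# Pool.map is handed a simple picklable, module-level callable (with the model
# bound via functools.partial). A minimal sketch of such a wrapper, purely as an
# assumption about its shape:
def _wrapped_loglikelihood_sketch(test, T):
    return loglikelihood(test, T)
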
def main(args):
    """load data and optimise the likelihood"""
    logging.basicConfig(
        level=(logging.DEBUG if args.verbose else logging.INFO),
        format='%(levelname)s %(message)s')
    # read in documents
    logging.info('Reading documents in ...')
    documents = read_documents(sys.stdin, args.boundary)
    logging.info('%d documents, on average %.2f sentences per document',
                 len(documents), np.mean([len(D) for D in documents]))
    least_common, min_count = find_least_common(documents) if args.unk else (frozenset(), 0)
    if args.unk:
        logging.info('Least common patterns: frequency=%d patterns=%d', min_count, len(least_common))
    # decide whether or not there will be a null symbol
    # encode documents using numpy array of ids
    T, vocab = encode_documents(documents, ignore=least_common)
    # gather unigram and bigram counts
    logging.info('Counting ...')
    U, B = count(T, len(vocab), insertion=args.insertion)
    logging.info('%d unigrams, %d bigrams', U.size, B.size)
    # compute log likelihood
    logging.info('Computing likelihood ...')
    ll = loglikelihood(T, U, B, args.smoothing, args.insertion)
    logging.info('Negative log likelihood %f with c=%f and insertion=%s', -ll, args.smoothing, args.insertion)
    # dumps U and B in a nice format
    tokens = [t for t, i in sorted(vocab.iteritems(), key=lambda (t, i): i)]
    V = len(tokens)
    logging.info('Writing unigrams to: %s', '{0}.unigrams'.format(args.output))
    with open('{0}.unigrams'.format(args.output), 'w') as fu:
        print >> fu, '#pattern\t#count'
        for u, n in sorted(enumerate(U), key=lambda (u, n): n, reverse=True):
            print >> fu, '{0}\t{1}'.format(tokens[u], n)
    logging.info('Writing bigrams to: %s', '{0}.bigrams'.format(args.output))
    with open('{0}.bigrams'.format(args.output), 'w') as fb:
        print >> fb, '#trigger\t#pattern\t#count'
        for u in xrange(V):
            # for each trigger we iterate over patterns so that the most frequent ones come first
            for v in sorted(itertools.ifilter(lambda v: B[u, v], xrange(V)),
                            key=lambda v: B[u, v], reverse=True):
                print >> fb, '{0}\t{1}\t{2}'.format(tokens[u], tokens[v], B[u, v])
    # legacy options: optimise likelihood
    if args.mle:
        logging.info('Minimising negative log likelihood')
        print minimize(T, U, B, args.insertion)

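# count(T, V, insertion=...) is defined elsewhere. As an illustrative sketch of
# the statistics it is assumed to gather (names and details hypothetical): U
# counts pattern occurrences and B counts how often pattern v follows trigger u
# across adjacent sentences of the same document, with the <null> trigger
# (row 0) receiving counts only when insertion is enabled.
def _count_sketch(documents, V, insertion=False):
    U = np.zeros(V, dtype=int)
    B = np.zeros((V, V), dtype=int)
    for doc in documents:
        for sentence in doc:
            for v in sentence:
                U[v] += 1
        for prev, cur in zip(doc, doc[1:]):
            for v in cur:
                for u in prev:
                    B[u, v] += 1
                if insertion:
                    B[0, v] += 1  # <null> may also trigger v
    return U, B
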
def decode_many(unigrams, bigrams, c, ipaths, opaths, jobs, estream=sys.stderr):
    # reads in the model
    logging.info('Loading model: %s and %s', unigrams, bigrams)
    U, B, vocab = load_model(unigrams, bigrams)
    logging.info('%d unigrams and %d bigrams', U.shape[0], B.shape[0])
    # detect whether document boundary tokens were used in the model
    boundaries = '<doc>' in vocab
    # detect whether insertion was switched on
    insertion = B[0, :].sum() > 0
    # reads in the test documents
    logging.info('Reading test documents in (boundaries=%s) ...', boundaries)
    tests = [None] * len(ipaths)
    for i, ipath in enumerate(ipaths):
        documents = read_documents(smart_open(ipath), boundaries)
        logging.info('%s: %d test documents read', ipath, len(documents))
        # encode test documents using the model's vocabulary
        tests[i] = encode_test_documents(documents, vocab)
    # computes the log likelihood of each document in each test file
    pool = Pool(jobs)
    all_L = pool.map(
        partial(wrapped_loglikelihood, U=U, B=B, c=c, insertion=insertion), tests)
    print >> estream, '#file\t#sum\t#mean'
    for ipath, opath, test, L in izip(ipaths, opaths, tests, all_L):
        with smart_open(opath, 'w') as ostream:
            # dumps scores
            print >> ostream, '#doc\t#logprob\t#sentences\t#s_normalised\t#patterns\t#p_normalised'
            for i, ll in enumerate(L):
                num_sentences = len(test[i])
                num_patterns = sum(len(row) for row in test[i])
                print >> ostream, '{0}\t{1}\t{2}\t{3}\t{4}\t{5}'.format(
                    i, ll, num_sentences, ll / num_sentences, num_patterns, ll / num_patterns)
        print >> estream, '{0}\t{1}\t{2}'.format(opath, L.sum(), np.mean(L))

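# smart_open is defined elsewhere; it is assumed to follow the common idiom of
# transparently handling gzip-compressed paths. A minimal sketch of that idiom
# (not the project's actual helper):
import gzip


def _smart_open_sketch(path, mode='r'):
    return gzip.open(path, mode) if path.endswith('.gz') else open(path, mode)
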
def main(args):
    logging.basicConfig(
        level=(logging.DEBUG if args.verbose else logging.INFO),
        format='%(asctime)s %(levelname)s %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S')
    # read documents
    logging.info('Reading documents in...')
    documents = read_documents(args.input, args.boundary)
    logging.info('%d documents read', len(documents))
    least_common, min_count = find_least_common(documents) if args.unk else (frozenset(), 0)
    if args.unk:
        logging.info('Least common patterns: frequency=%d patterns=%d', min_count, len(least_common))
    # maps tokens to integer ids (0 is reserved for a special <null> symbol)
    # and encodes the training data using numpy arrays of vocab ids
    logging.info('Making vocab')
    corpus, vocab = encode_documents(documents, ignore=least_common)
    logging.info('%d tokens read (including <null> and <unk>)', len(vocab))
    # estimates parameters T[f,e] = t(f|e)
    # where (e, f) are syntactic patterns occurring in adjacent sentences in a document
    T, LL = ibm1(corpus, len(vocab), args.max_iterations, args.min_gain, args.progress)
    T = np.nan_to_num(T)
    # store the log-likelihood values
    if args.ll:
        with open(args.ll, 'w') as fo:
            for ll in LL:
                fo.write('{0}\n'.format(ll))
    # dumps T in a nice format
    tokens = [t for t, i in sorted(vocab.iteritems(), key=lambda (t, i): i)]
    V = len(tokens)
    # we print a header so that the meaning of each column is clear
    print >> args.output, '#trigger\t#pattern\t#p(pattern|trigger)'
    # note that e=trigger and f=pattern
    # we iterate over f in no particular order (simply that of the vocabulary ids)
    for f in xrange(V):
        # we iterate over triggers so that the most likely ones come first
        for e in sorted(itertools.ifilter(lambda e: T[f, e], xrange(V)),
                        key=lambda e: T[f, e], reverse=True):
            print >> args.output, '{0}\t{1}\t{2}'.format(tokens[e], tokens[f], T[f, e])

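# ibm1(corpus, V, max_iterations, min_gain, progress) is defined elsewhere and
# returns the table T together with the per-iteration log-likelihoods LL. As an
# illustrative sketch of the EM procedure it is assumed to run over adjacent
# sentence pairs (convergence via min_gain, LL tracking and progress reporting
# omitted; names hypothetical):
def _ibm1_sketch(corpus, V, max_iterations=10):
    T = np.full((V, V), 1.0 / V)   # t(f|e) initialised uniformly
    for _ in range(max_iterations):
        counts = np.zeros((V, V))
        totals = np.zeros(V)
        for doc in corpus:
            for prev, cur in zip(doc, doc[1:]):
                triggers = np.append(prev, 0)            # previous sentence's patterns plus <null>
                for f in cur:
                    p = T[f, triggers]
                    p = p / p.sum()                      # E-step: posterior responsibility of each trigger
                    np.add.at(counts, (f, triggers), p)  # expected counts c(f, e)
                    np.add.at(totals, triggers, p)       # expected counts c(e)
        T = counts / totals                              # M-step: t(f|e) = c(f, e) / c(e)
    return T   # unseen triggers yield nan columns, hence np.nan_to_num(T) above
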