def decode_many(unigrams, bigrams, salience, ipaths, opaths, jobs, estream=sys.stderr):
    # reads in the model
    logging.info('Reading unigrams from: %s', unigrams)
    U = read_unigrams(smart_open(unigrams), r2i)
    logging.info('Read in %d unigrams', U.size)
    logging.info('Reading bigrams from: %s', bigrams)
    B = read_bigrams(smart_open(bigrams), r2i)
    logging.info('Read in %d bigrams', B.size)
    # reads in the test documents (entity grids)
    tests = [None] * len(ipaths)
    for i, ipath in enumerate(ipaths):
        documents = read_grids(smart_open(ipath), r2i)
        logging.info('%s: %d test documents read', ipath, len(documents))
        tests[i] = documents
    # computes the log likelihood of each document in each test file
    pool = Pool(jobs)
    all_L = pool.map(partial(wrapped_loglikelihood, U=U, B=B, salience=salience), tests)
    print >> estream, '#file\t#sum\t#mean'
    for ipath, opath, test, L in itertools.izip(ipaths, opaths, tests, all_L):
        with smart_open(opath, 'w') as ostream:
            # dumps scores
            print >> ostream, '#doc\t#logprob\t#sentences\t#entities'
            for i, ll in enumerate(L):
                num_sentences = test[i].shape[0]
                num_entities = test[i].shape[1]
                print >> ostream, '{0}\t{1}\t{2}\t{3}'.format(i, ll, num_sentences, num_entities)
        print >> estream, '{0}\t{1}\t{2}'.format(opath, L.sum(), np.mean(L))
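# The `wrapped_loglikelihood` target handed to Pool.map above is not shown in
# this excerpt. A minimal sketch follows, assuming an underlying scorer named
# `loglikelihood` (hypothetical name and signature). What the surrounding code
# does pin down: Pool.map needs a picklable module-level function, each worker
# receives one list of grids, and the result must support L.sum() and
# np.mean(L), i.e. a numpy array.
def wrapped_loglikelihood(documents, U, B, salience):
    try:
        # one log-likelihood per test document (grid)
        return np.array([loglikelihood(grid, U, B, salience) for grid in documents])
    except:
        # re-raise with a formatted traceback so the error survives the trip
        # back from the worker process
        raise Exception(''.join(traceback.format_exception(*sys.exc_info())))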
def decode_many(model, ipaths, opaths, jobs, estream=sys.stderr):
    # reads in the model
    logging.info('Loading model: %s', model)
    T, vocab = load_model(model)
    logging.info('%d patterns and %d entries', len(vocab), T.size)
    # detect whether document boundary tokens were used in the model
    boundaries = '<doc>' in vocab
    # reads in the test documents
    logging.info('Reading test documents in (boundaries=%s) ...', boundaries)
    tests = [None] * len(ipaths)
    for i, ipath in enumerate(ipaths):
        documents = read_documents(smart_open(ipath), boundaries)
        logging.info('%s: %d test documents read', ipath, len(documents))
        # encode test documents using the model's vocabulary
        tests[i] = encode_test_documents(documents, vocab)
    # computes the log likelihood of each document in each test file
    pool = Pool(jobs)
    all_L = pool.map(partial(wrapped_loglikelihood, T=T), tests)
    print >> estream, '#file\t#sum\t#mean'
    for ipath, opath, test, L in izip(ipaths, opaths, tests, all_L):
        with smart_open(opath, 'w') as ostream:
            # dumps scores
            print >> ostream, '#doc\t#logprob\t#sentences\t#s_normalised\t#patterns\t#p_normalised'
            for i, ll in enumerate(L):
                num_sentences = len(test[i])
                num_patterns = sum(len(row) for row in test[i])
                print >> ostream, '{0}\t{1}\t{2}\t{3}\t{4}\t{5}'.format(i, ll, num_sentences, ll / num_sentences, num_patterns, ll / num_patterns)
        print >> estream, '{0}\t{1}\t{2}'.format(opath, L.sum(), np.mean(L))
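# `encode_test_documents` is defined elsewhere; a plausible sketch is given
# below, assuming `vocab` maps pattern strings to integer indices (which the
# `'<doc>' in vocab` test above suggests) and that unseen patterns fall back
# to a reserved <unk> entry. Both the <unk> convention and the exact return
# shape are assumptions; the code above only requires that each encoded
# document be a list of sentence rows.
def encode_test_documents(documents, vocab):
    unk = vocab.get('<unk>', 0)  # assumed index for patterns unseen in training
    return [[[vocab.get(pattern, unk) for pattern in sentence]
             for sentence in document]
            for document in documents]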
def decode_many(unigrams, bigrams, c, ipaths, opaths, jobs, estream=sys.stderr):
    # reads in the model
    logging.info('Loading model: %s and %s', unigrams, bigrams)
    U, B, vocab = load_model(unigrams, bigrams)
    logging.info('%d unigrams and %d bigrams', U.shape[0], B.shape[0])
    # detect whether document boundary tokens were used in the model
    boundaries = '<doc>' in vocab
    # detect whether insertion was switched on
    insertion = B[0, :].sum() > 0
    # reads in the test documents
    logging.info('Reading test documents in (boundaries=%s) ...', boundaries)
    tests = [None] * len(ipaths)
    for i, ipath in enumerate(ipaths):
        documents = read_documents(smart_open(ipath), boundaries)
        logging.info('%s: %d test documents read', ipath, len(documents))
        # encode test documents using the model's vocabulary
        tests[i] = encode_test_documents(documents, vocab)
    # computes the log likelihood of each document in each test file
    pool = Pool(jobs)
    all_L = pool.map(partial(wrapped_loglikelihood, U=U, B=B, c=c, insertion=insertion), tests)
    print >> estream, '#file\t#sum\t#mean'
    for ipath, opath, test, L in izip(ipaths, opaths, tests, all_L):
        with smart_open(opath, 'w') as ostream:
            # dumps scores
            print >> ostream, '#doc\t#logprob\t#sentences\t#s_normalised\t#patterns\t#p_normalised'
            for i, ll in enumerate(L):
                num_sentences = len(test[i])
                num_patterns = sum(len(row) for row in test[i])
                print >> ostream, '{0}\t{1}\t{2}\t{3}\t{4}\t{5}'.format(i, ll, num_sentences, ll / num_sentences, num_patterns, ll / num_patterns)
        print >> estream, '{0}\t{1}\t{2}'.format(opath, L.sum(), np.mean(L))
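# Hypothetical invocation of the unigram/bigram variant (all file names made
# up, and `c` is the hyperparameter forwarded to the scorer, here set
# arbitrarily): scores two test sets with 4 worker processes, pairing each
# input with an output file and printing a per-file summary to stderr.
if __name__ == '__main__':
    decode_many('model.unigrams.gz', 'model.bigrams.gz', c=1.0,
                ipaths=['dev.doctext.gz', 'test.doctext.gz'],
                opaths=['dev.scores.gz', 'test.scores.gz'],
                jobs=4)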
def file_check(corpus, input_dir, output_dir):
    todo = frozenset(os.path.basename(path) for path in glob('{0}/{1}*'.format(input_dir, corpus)))
    logging.info('%d files matching %s', len(todo), '{0}/{1}*'.format(input_dir, corpus))
    done = frozenset(os.path.basename(path) for path in glob('{0}/{1}*'.format(output_dir, corpus)))
    logging.info('%d files matching %s', len(done), '{0}/{1}*'.format(output_dir, corpus))
    missing = todo - done
    return todo, done, missing


def wrap_dseqs((i, ipath, opath), depth, **kwargs):
    """
    Wrap a call to dseqs. To be used with Pool.map.
    """
    try:
        logging.info('(%d) %s', i, ipath)
        fi = smart_open(ipath, 'r')
        fo = smart_open(opath, 'w')
        for trees, attrs in iterdoctext(fi):
            sequences = [' '.join(dseqs(tree, depth=depth, **kwargs)) for tree in trees]
            writedoctext(fo, sequences, **attrs)
    except:
        # re-raise with a formatted traceback so the error survives the trip
        # back from the worker process
        raise Exception(''.join(traceback.format_exception(*sys.exc_info())))


def extract_dseqs(corpus, args, namespace, **kwargs):
    """
    Extracts d-sequences for a certain corpus
    """
    logging.info('Extracting d-sequences for: %s', corpus)
    input_dir = namespace.trees
    output_dir = namespace.dseqs
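# The body of extract_dseqs is truncated at this point in the excerpt. A
# hypothetical continuation is sketched below as a separate helper, wiring
# together file_check and wrap_dseqs; the attribute names namespace.jobs and
# namespace.depth are assumptions, not taken from the original source.
def _extract_dseqs_sketch(corpus, namespace, **kwargs):
    input_dir = namespace.trees
    output_dir = namespace.dseqs
    todo, done, missing = file_check(corpus, input_dir, output_dir)
    # one (index, input path, output path) task per file still to be processed
    tasks = [(i, os.path.join(input_dir, name), os.path.join(output_dir, name))
             for i, name in enumerate(sorted(missing))]
    pool = Pool(namespace.jobs)
    pool.map(partial(wrap_dseqs, depth=namespace.depth, **kwargs), tasks)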