Example #1
def decode_many(unigrams, bigrams, salience, ipaths, opaths, jobs, estream=sys.stderr):

    # reads in the model
    logging.info('Reading unigrams from: %s', unigrams)
    U = read_unigrams(smart_open(unigrams), r2i)
    logging.info('Read in %d unigrams', U.size)

    logging.info('Reading bigrams from: %s', bigrams)
    B = read_bigrams(smart_open(bigrams), r2i)
    logging.info('Read in %d bigrams', B.size)
    
    
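    # reads in the test documents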
    tests = [None] * len(ipaths)
    for i, ipath in enumerate(ipaths):
        documents = read_grids(smart_open(ipath), r2i)
        logging.info('%s: %d test documents read', ipath, len(documents))
        tests[i] = documents 

    # computes the log likelihood of each document in each test file
    pool = Pool(jobs)
    all_L = pool.map(partial(wrapped_loglikelihood, U=U, B=B, salience=salience), tests)

    print >> estream, '#file\t#sum\t#mean'
    for ipath, opath, test, L in itertools.izip(ipaths, opaths, tests, all_L):
        with smart_open(opath, 'w') as ostream:
            # dumps scores
            print >> ostream, '#doc\t#logprob\t#sentences\t#entities'
            for i, ll in enumerate(L):
                num_sentences = test[i].shape[0]
                num_entities = test[i].shape[1]
                print >> ostream, '{0}\t{1}\t{2}\t{3}'.format(i, ll, num_sentences, num_entities)
            print >> estream, '{0}\t{1}\t{2}'.format(opath, L.sum(), np.mean(L))
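A brief usage note (not part of the original file): the function binds the unigram and bigram tables into the worker with functools.partial and fans each file's document list out over a multiprocessing Pool, so every input file gets one scores file. A minimal call sketch with hypothetical paths and a placeholder salience value (it also assumes the role-to-index mapping r2i used above is defined at module level):

decode_many(unigrams='model/unigrams.gz',
            bigrams='model/bigrams.gz',
            salience=1,  # placeholder; the expected type/meaning is defined by wrapped_loglikelihood
            ipaths=['test/fold1.grids.gz', 'test/fold2.grids.gz'],
            opaths=['scores/fold1.ll', 'scores/fold2.ll'],
            jobs=2)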
Example #2
def decode_many(model, ipaths, opaths, jobs, estream=sys.stderr):

    # reads in the model
    logging.info('Loading model: %s', model)
    T, vocab = load_model(model)
    logging.info('%d patterns and %d entries', len(vocab), T.size)

    # detect whether document boundary tokens were used in the model
    boundaries = '<doc>' in vocab

    # reads in the test documents
    logging.info('Reading test documents in (boundaries=%s) ...', boundaries)

    tests = [None] * len(ipaths)
    for i, ipath in enumerate(ipaths):
        documents = read_documents(smart_open(ipath), boundaries)  
        logging.info('%s: %d test documents read', ipath, len(documents))
        # encode test documents using the model's vocabulary
        tests[i] = encode_test_documents(documents, vocab)

    # computes the log likelihood of each document in each test file
    pool = Pool(jobs)
    all_L = pool.map(partial(wrapped_loglikelihood, T=T), tests)

    print >> estream, '#file\t#sum\t#mean'
    for ipath, opath, test, L in izip(ipaths, opaths, tests, all_L):
        with smart_open(opath, 'w') as ostream:
            # dumps scores
            print >> ostream, '#doc\t#logprob\t#sentences\t#s_normalised\t#patterns\t#p_normalised'
            for i, ll in enumerate(L):
                num_sentences = len(test[i])
                num_patterns = sum(len(row) for row in test[i])
                print >> ostream, '{0}\t{1}\t{2}\t{3}\t{4}\t{5}'.format(i, ll, num_sentences, 
                        ll/num_sentences, num_patterns, ll/num_patterns)
            print >> estream, '{0}\t{1}\t{2}'.format(opath, L.sum(), np.mean(L))
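A small follow-up sketch (not in the original): each output file written above starts with a '#'-prefixed header line followed by tab-separated numeric columns, so it can be loaded back for analysis; the path below is a placeholder.

import numpy as np

def read_scores(path):
    with open(path) as f:
        f.readline()  # skip the '#doc\t#logprob\t...' header line
        rows = [line.rstrip('\n').split('\t') for line in f if line.strip()]
    return np.array(rows, dtype=float)

scores = read_scores('scores/fold1.ll')
mean_s_normalised = scores[:, 3].mean()  # the '#s_normalised' column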
Example #3
def decode_many(unigrams,
                bigrams,
                c,
                ipaths,
                opaths,
                jobs,
                estream=sys.stderr):

    # reads in the model
    logging.info('Loading model: %s and %s', unigrams, bigrams)
    U, B, vocab = load_model(unigrams, bigrams)
    logging.info('%d unigrams and %d bigrams', U.shape[0], B.shape[0])

    # detect whether document boundary tokens were used in the model
    boundaries = '<doc>' in vocab
    # detect whether insertion was switched on
    insertion = B[0, :].sum() > 0

    # reads in the test documents
    logging.info('Reading test documents in (boundaries=%s) ...', boundaries)

    tests = [None] * len(ipaths)
    for i, ipath in enumerate(ipaths):
        documents = read_documents(smart_open(ipath), boundaries)
        logging.info('%s: %d test documents read', ipath, len(documents))
        # encode test documents using the model's vocabulary
        tests[i] = encode_test_documents(documents, vocab)

    # computes the log likelihood of each document in each test file
    pool = Pool(jobs)
    all_L = pool.map(
        partial(wrapped_loglikelihood, U=U, B=B, c=c, insertion=insertion),
        tests)

    print >> estream, '#file\t#sum\t#mean'
    for ipath, opath, test, L in izip(ipaths, opaths, tests, all_L):
        with smart_open(opath, 'w') as ostream:
            # dumps scores
            print >> ostream, '#doc\t#logprob\t#sentences\t#s_normalised\t#patterns\t#p_normalised'
            for i, ll in enumerate(L):
                num_sentences = len(test[i])
                num_patterns = sum(len(row) for row in test[i])
                print >> ostream, '{0}\t{1}\t{2}\t{3}\t{4}\t{5}'.format(
                    i, ll, num_sentences, ll / num_sentences, num_patterns,
                    ll / num_patterns)
            print >> estream, '{0}\t{1}\t{2}'.format(opath, L.sum(),
                                                     np.mean(L))
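A toy illustration (not in the original) of the two model probes above: boundary handling is detected by membership of the '<doc>' token in the vocabulary, and insertion by whether row 0 of the bigram table (assumed here to be the row reserved for insertions) carries any mass.

import numpy as np

vocab = {'<doc>': 0, 'NP': 1, 'VP': 2}   # hypothetical vocabulary
B = np.zeros((3, 3))
B[0, 1] = 0.25                           # put some mass on the insertion row

boundaries = '<doc>' in vocab            # True: boundary tokens were trained
insertion = B[0, :].sum() > 0            # True: insertion was switched on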
Example #4
    done = frozenset(
        os.path.basename(path)
        for path in glob('{0}/{1}*'.format(output_dir, corpus)))
    logging.info('%d files matching %s', len(done),
                 '{0}/{1}*'.format(output_dir, corpus))
    missing = todo - done
    return todo, done, missing


def wrap_dseqs((i, ipath, opath), depth, **kwargs):
    """
    Wrap a call to dseqs. To be used with Pool.map.
    """
    try:
        logging.info('(%d) %s ', i, ipath)
        fi = smart_open(ipath, 'r')
        fo = smart_open(opath, 'w')
        for trees, attrs in iterdoctext(fi):
            sequences = [
                ' '.join(dseqs(tree, depth=depth, **kwargs)) for tree in trees
            ]
            writedoctext(fo, sequences, **attrs)
    except:
        raise Exception(''.join(traceback.format_exception(*sys.exc_info())))


def extract_dseqs(corpus, args, namespace, **kwargs):
    """
    Extracts dsequences for a certain corpus
    """
Example #5
def file_check(corpus, input_dir, output_dir):
    todo = frozenset(os.path.basename(path) for path in glob('{0}/{1}*'.format(input_dir, corpus)))
    logging.info('%d files matching %s', len(todo), '{0}/{1}*'.format(input_dir, corpus))
    done = frozenset(os.path.basename(path) for path in glob('{0}/{1}*'.format(output_dir, corpus)))
    logging.info('%d files matching %s', len(done), '{0}/{1}*'.format(output_dir, corpus))
    missing = todo - done
    return todo, done, missing

def wrap_dseqs((i, ipath, opath), depth, **kwargs):
    """
    Wrap a call to dseqs. To be used with Pool.map.
    """
    try:
        logging.info('(%d) %s ', i, ipath)
        fi = smart_open(ipath, 'r')
        fo = smart_open(opath, 'w')
        for trees, attrs in iterdoctext(fi):
            sequences = [' '.join(dseqs(tree, depth=depth, **kwargs)) for tree in trees]
            writedoctext(fo, sequences, **attrs)
    except:
        raise Exception(''.join(traceback.format_exception(*sys.exc_info())))

def extract_dseqs(corpus, args, namespace, **kwargs):
    """
    Extracts dsequences for a certain corpus
    """

    logging.info('Extracting d-sequences for: %s', corpus)
    input_dir = namespace.trees
    output_dir = namespace.dseqs
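A hedged usage sketch (not in the original) of how file_check's return value would typically drive the per-file extraction loop; the corpus name, directories and process_one() are hypothetical placeholders.

todo, done, missing = file_check('wsj', 'data/trees', 'data/dseqs')
logging.info('%d to do, %d done, %d missing', len(todo), len(done), len(missing))
for name in sorted(missing):
    process_one(os.path.join('data/trees', name),
                os.path.join('data/dseqs', name))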