Example #1
def parse_cube(lang_name, finite):
    """
        Reads the hypotheses saved for lang_name, estimates their precision/recall and posterior scores, and saves everything into a cube (a list of tables).

        data structure:
            stats = [cube, topn]
            cube = [[size, Z, table], [size, Z, table], ...]
            table = [[ind, score, p, r, f, strs], [ind, score, p, r, f, strs], ...]

        NOTE: topn here is a dict; use ind to look up the corresponding h

        example script:
            mpiexec -n 12 python parse_hypothesis.py --mode=parse_cube --language=An --finite=3/10
    """
    _dir = 'out/'
    global size
    global rank
    topn = dict()
    prf_dict = {}
    language = instance(lang_name, finite)

    if rank == 0:
        truncate_flag = not (lang_name == 'An' and finite <= 3)
        set_topn = set()
        print 'loading..'
        fff()
        for file_name in listdir(_dir):
            if lang_name + '_' in file_name:
                _set = load(open(_dir + file_name))
                set_topn.update(_set)

        print 'getting p&r..'
        fff()
        pr_data = language.sample_data_as_FuncData(2048)
        for h in set_topn:
            p, r, h_llcounts = language.estimate_precision_and_recall(
                h, pr_data, truncate=truncate_flag)
            prf_dict[h] = [p, r, 0 if p + r == 0 else 2 * p * r / (p + r)]
            h.fixed_ll_counts = h_llcounts

        topn = dict(enumerate(set_topn))
        print 'bcasting..'
        fff()

    topn = comm.bcast(topn, root=0)
    prf_dict = comm.bcast(prf_dict, root=0)

    print rank, 'getting posterior'
    fff()
    # work_list = slice_list(np.arange(0, 72, 6), size)
    work_list = slice_list(np.arange(120, 264, 12), size)

    cube = []
    for s in work_list[rank]:
        eval_data = language.sample_data_as_FuncData(s)
        for ind, h in topn.iteritems():
            h.likelihood_temperature = 100
            h.compute_posterior(eval_data)

        Z = logsumexp([h.posterior_score for h in topn.itervalues()])

        table = [[
            ind, h.posterior_score, prf_dict[h][0], prf_dict[h][1],
            prf_dict[h][2], h.fixed_ll_counts
        ] for ind, h in topn.iteritems()]
        table.sort(key=lambda x: x[1], reverse=True)
        cube += [[s, Z, table]]

        print rank, s, 'done'
        fff()

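    # gather: non-root ranks send their partial cube to rank 0 and exit, so only
    # rank 0 survives to sort the slices by data amount and dump the final stats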
    if rank == 0:
        for i in xrange(1, size):
            cube += comm.recv(source=i)
    else:
        comm.send(cube, dest=0)
        print rank, 'table sent'
        fff()
        sys.exit(0)

    cube.sort(key=lambda x: x[0])
    dump([cube, topn], open(lang_name + '_stats' + suffix, 'w'))
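
# A minimal sketch (not part of the original script) of reading the dumped stats back.
# It assumes load is the same pickle-style loader used above; 'An_stats_example' is a
# hypothetical placeholder file name.
import numpy as np
from pickle import load

cube, topn = load(open('An_stats_example'))
for s, Z, table in cube:                     # one [size, Z, table] entry per data amount
    ind, score, p, r, f, strs = table[0]     # rows are sorted by score, so [0] is the MAP
    print s, np.exp(score - Z), p, r, f      # exp(score - Z) is the normalized posterior
    print topn[ind]                          # ind indexes back into the topn dict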
Example #2
    prefix = 'out/'
    # prefix = '/home/lijm/WORK/yuan/lot/'
    suffix = time.strftime('_' + options.NAME + '_%m%d_%H%M%S',
                           time.localtime())

    # set the output codec -- needed to display lambda to stdout
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)
    if rank == 0:
        display_option_summary(options)
        fff()

    # meant to be run with 12 MPI processes (one per value in DATA_RANGE)
    DATA_RANGE = np.arange(0, 70, 6)
    # DATA_RANGE = np.arange(70, 140, 6)

    language = instance(options.LANG, options.FINITE)
    args = list(itertools.product([make_hypothesis], [language], DATA_RANGE))
    # run on MPI
    # results = MPI_map(run, args)
    hypotheses = simple_mpi_map(run, args)

    # ========================================================================================================
    # Get stats
    # ========================================================================================================

    # dump(hypotheses, open(prefix+'hypotheses_'+options.LANG+suffix, 'w'))

    # # get precision and recall for h
    # pr_data = language.sample_data_as_FuncData(1024)
    # p = []
    # r = []
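
    # A minimal sketch (not in the original fragment) of the precision/recall step the
    # commented block above gestures at; estimate_precision_and_recall is called with
    # the two-value signature used elsewhere in this file.
    pr_data = language.sample_data_as_FuncData(1024)
    p, r = [], []
    for h in hypotheses:
        _p, _r = language.estimate_precision_and_recall(h, pr_data)
        p.append(_p)
        r.append(_r)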
Example #3
def make_staged_posterior_seq(_dir, temperature, lang_name, dtype):
    """
        script: python parse_hypothesis.py --mode=make_staged_posterior_seq --file=file --temp=1 --language=AnBn --dtype=staged/uniform

        1. read raw file
        2. compute fixed Counter
        3. compute posterior for different amounts

        dumped posterior format: [topn, [Z, amount, finite, [s1, s2, ...]], ...] (one [Z, amount, finite, scores] entry per amount)

        NOTE: if _dir is a previously dumped posterior seq file, it is loaded and reused
    """

    if not (os.path.isfile(_dir) and 'posterior_seq' in _dir):

        topn = set()

        for filename in os.listdir(_dir):
            if ('staged' in filename
                    or 'normal' in filename) and 'seq' not in filename:
                print 'load', filename
                _set = load(open(_dir + filename))
                topn.update(_set)
        topn = list(topn)

        # fix the ll counts once up front to save time and make the curve smooth
        print 'get llcnts...'
        for h in topn:
            llcnts = Counter([h() for _ in xrange(2048)])
            h.fixed_ll_counts = llcnts

        seq = []
        seq.append(topn)

        for amount, finite in mk_staged_wlist(0, 200, 2, [48, 96]):

            print 'posterior on', amount, finite

            if dtype == 'staged':
                language = instance(lang_name, finite)
                eval_data = language.sample_data_as_FuncData(amount)
            elif dtype == 'uniform':
                eval_data = uniform_data(amount, 12)
            else:
                raise ValueError('unknown dtype: %s' % dtype)

            for h in topn:
                h.likelihood_temperature = temperature
                h.compute_posterior(eval_data)

            Z = logsumexp([h.posterior_score for h in topn])
            seq.append([Z, amount, finite, [h.posterior_score for h in topn]])

        dump(seq, open(dtype + '_posterior_seq' + suffix, 'w'))

    else:
        seq = load(open(_dir))

    # ====================== compute KL based on seq =======================

    print 'compute kl seq...'
    kl_seq = []
    topn = seq.pop(0)
    for i in xrange(len(seq) - 1):
        kl_seq.append([seq[i][1], compute_kl2(seq[i], seq[i + 1])])

    dump(kl_seq, open(dtype + '_kl_seq' + suffix, 'w'))
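
# A minimal sketch of one plausible KL computation between consecutive snapshots (not
# the original compute_kl2): each snapshot is [Z, amount, finite, scores], and
# exp(score - Z) is the normalized posterior over the shared topn list.
import numpy as np

def kl_between_snapshots(a, b):
    p = np.exp(np.array(a[3]) - a[0])   # posterior at the earlier data amount
    q = np.exp(np.array(b[3]) - b[0])   # posterior at the later data amount
    eps = 1e-12                         # guard against log(0) from underflow
    return float(np.sum(p * (np.log(p + eps) - np.log(q + eps))))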
Example #4
def parse_plot(lang_name, finite, is_plot):
    """
        run: MPI supported

        example:
            mpiexec -n 12 python parse_hypothesis.py --mode=parse_plot --language=An --finite=3 --plot=yes --wfs=yes
    """
    _dir = 'out/final/'
    global size
    global rank
    topn = set()
    prf_dict = {}
    language = instance(lang_name, finite)

    if rank == 0:
        print 'loading..'
        fff()
        for file_name in listdir(_dir):
            if lang_name + '_' in file_name:
                _set = load(open(_dir + file_name))
                topn.update(_set)

        print 'getting p&r..'
        fff()
        pr_data = language.sample_data_as_FuncData(1024)
        for h in topn:
            p, r = language.estimate_precision_and_recall(h, pr_data)
            prf_dict[h] = [p, r, 0 if p + r == 0 else 2 * p * r / (p + r)]

        dump(prf_dict, open(lang_name + '_prf_dict' + suffix, 'w'))

    topn = comm.bcast(topn, root=0)
    prf_dict = comm.bcast(prf_dict, root=0)

    print rank, 'getting posterior'
    fff()
    work_list = slice_list(np.arange(235, 300, 5), size)
    seq = []

    pnt_str = 'Weighted F-score' if options.WFS == 'yes' else 'Posterior Probability'
    for s in work_list[rank]:
        eval_data = language.sample_data_as_FuncData(s)
        for h in topn:
            h.likelihood_temperature = 100
            h.compute_posterior(eval_data)

        Z = logsumexp([h.posterior_score for h in topn])

        if options.WFS == 'yes':
            tmp = sum(
                [prf_dict[h][2] * np.exp(h.posterior_score - Z) for h in topn])
            # TODO
            # else: tmp = sum([np.exp(h.posterior_score - Z) for h in topn if prf_dict[h][2] > 0.9])
        else:
            tmp = sum([
                np.exp(h.posterior_score - Z) for h in topn
                if (prf_dict[h][0] < 0.3 and prf_dict[h][1] > 0.9)
            ])

        if options.PROB == 'yes':
            dump([topn, Z], open(lang_name + '_prob_' + str(s) + suffix, 'w'))

        seq.append([s, tmp])
        print 'size: %.1f' % s, '%s: %.2f' % (pnt_str, tmp)
        fff()

        # debug: print the top-3 hypotheses by posterior score
        _list = [h for h in topn]
        _list.sort(key=lambda x: x.posterior_score, reverse=True)
        for i in xrange(3):
            print 'prob: ', np.exp(_list[i].posterior_score - Z), 'p,r: ', prf_dict[_list[i]][:2],
            print Counter([_list[i]() for _ in xrange(256)])
            print _list[i]
        print '=' * 50
        fff()

    if rank == 0:
        for i in xrange(1, size):
            seq += comm.recv(source=i)
    else:
        comm.send(seq, dest=0)
        sys.exit(0)

    seq.sort(key=lambda x: x[0])
    dump(seq, open(lang_name + '_seq' + suffix, 'w'))

    if is_plot == 'yes':
        x, y = zip(*seq)
        plt.plot(x, y)

        plt.ylabel(pnt_str)
        plt.xlabel('Size of Data')
        plt.title(lang_name)
        plt.show()
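
# A minimal sketch of one plausible slice_list (not the original helper): split the
# work array into `size` near-equal chunks so each MPI rank picks work_list[rank].
def slice_list_sketch(items, size):
    chunks = [[] for _ in xrange(size)]
    for i, item in enumerate(items):
        chunks[i % size].append(item)   # round-robin keeps chunk lengths within one
    return chunks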
Example #5
            mpiexec -n 12 python my_search_stp.py --language=Dyck --finite=8 --N=2 --terminal=b --bound=7 --steps=100000
            mpiexec -n 12 python my_search_stp.py --language=SimpleEnglish --finite=8 --N=3 --bound=5 --steps=100000
    """
    # ========================================================================================================
    # Process command line arguments
    # ========================================================================================================
    fff = sys.stdout.flush
    parser = OptionParser()
    parser.add_option("--language", dest="LANG", type="string", default='An', help="name of a language")
    parser.add_option("--steps", dest="STEPS", type="int", default=40000, help="Number of samples to run")
    parser.add_option("--top", dest="TOP_COUNT", type="int", default=20, help="Top number of hypotheses to store")
    parser.add_option("--finite", dest="FINITE", type="int", default=10, help="specify the max_length to make language finite")
    parser.add_option("--name", dest="NAME", type="string", default='', help="name of file")
    parser.add_option("--N", dest="N", type="int", default=3, help="number of inner hypotheses")
    parser.add_option("--terminal", dest="TERMINALS", type="string", default='', help="extra terminals")
    parser.add_option("--bound", dest="BOUND", type="int", default=5, help="recursion bound")
    (options, args) = parser.parse_args()

    prefix = 'out/'
    # prefix = '/home/lijm/WORK/yuan/lot/'
    suffix = time.strftime('_' + options.NAME + '_%m%d_%H%M%S', time.localtime())

    sys.stdout = codecs.getwriter('utf8')(sys.stdout)
    if rank == 0: display_option_summary(options); fff()

    DATA_RANGE = np.arange(0, 70, 6)

    language = instance(options.LANG, options.FINITE)
    args = list(itertools.product([make_hypothesis], [language], DATA_RANGE))

    hypotheses = simple_mpi_map(run, args)
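
    # A quick sanity sketch (not in the original): itertools.product above just pairs
    # the single make_hypothesis and language with every data amount in DATA_RANGE.
    assert len(args) == len(DATA_RANGE)
    assert all(a == (make_hypothesis, language, amt)
               for a, amt in zip(args, DATA_RANGE))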
Example #6
def parse_plot(lang_name, finite, is_plot):
    """
        run: MPI supported
    """
    _dir = 'out/final/'
    global size
    global rank

    print 'loading..'
    fff()
    topn = set()
    for file_name in listdir(_dir):
        if lang_name in file_name:
            _set = load(open(_dir + file_name))
            topn.update(_set)

    language = instance(lang_name, finite)

    print 'getting p&r..'
    fff()
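    # note: unlike the bcast variant above, every rank recomputes precision/recall here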
    prf_dict = {}
    pr_data = language.sample_data_as_FuncData(1024)
    for h in topn:
        p, r = language.estimate_precision_and_recall(h, pr_data)
        prf_dict[h] = [p, r, 0 if p + r == 0 else 2 * p * r / (p + r)]

    if rank == 0:
        dump(prf_dict, open(lang_name + '_prf_dict' + suffix, 'w'))

    print rank, 'getting posterior'
    fff()
    work_list = slice_list(np.arange(0, 30, 1), size)
    seq = []
    for s in work_list[rank]:
        eval_data = language.sample_data_as_FuncData(s)
        for h in topn:
            h.likelihood_temperature = 100
            h.compute_posterior(eval_data)

        Z = logsumexp([h.posterior_score for h in topn])
        wfs = sum(
            [prf_dict[h][2] * np.exp(h.posterior_score - Z) for h in topn])
        seq.append([s, wfs])
        print 'size: %.1f' % s, 'weighted F-score: %.2f' % wfs
        fff()

        # debug: print the top-3 hypotheses by posterior score
        _list = [h for h in topn]
        _list.sort(key=lambda x: x.posterior_score, reverse=True)
        for i in xrange(3):
            print 'prob: ', np.exp(_list[i].posterior_score - Z), 'p,r: ', prf_dict[_list[i]][:2],
            print Counter([_list[i]() for _ in xrange(256)])
            print _list[i]
        print '=' * 50
        fff()

    if rank == 0:
        for i in xrange(1, size):
            seq += comm.recv(source=i)
    else:
        comm.send(seq, dest=0)
        sys.exit(0)

    seq.sort(key=lambda x: x[0])
    dump(seq, open(lang_name + '_seq' + suffix, 'w'))

    if is_plot == 'yes':
        x, y = zip(*seq)
        plt.plot(x, y)

        plt.ylabel('Weighted F-score')
        plt.xlabel('Size of Data')
        plt.title(lang_name)
        plt.show()
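
# A minimal sketch (not in the original) of the posterior-weighted F-score computed
# above: exp(score - Z) sums to 1 over topn, so wfs is the expectation of F under
# the (temperature-adjusted) posterior.
import numpy as np

def weighted_f_sketch(scores, fscores):
    scores = np.array(scores, dtype=float)
    Z = np.log(np.sum(np.exp(scores - scores.max()))) + scores.max()  # stable logsumexp
    probs = np.exp(scores - Z)            # normalized posterior over the top hypotheses
    return float(np.sum(probs * np.array(fscores)))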
Example #7
    parser.add_option("--top", dest="TOP_COUNT", type="int", default=20, help="Top number of hypotheses to store")
    parser.add_option("--finite", dest="FINITE", type="int", default=10, help="specify the max_length to make language finite")
    parser.add_option("--name", dest="NAME", type="string", default='', help="name of file")
    (options, args) = parser.parse_args()

    suffix = time.strftime('_' + options.NAME + '_%m%d_%H%M%S', time.localtime())

    # set the output codec -- needed to display lambda to stdout
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)
    if is_master_process():
        display_option_summary(options); fff()

    # this is meant to be run with 12 MPI processes
    DATA_RANGE = np.arange(1, 64, 6)

    language = instance(options.LANG)
    args = list(itertools.product([make_hypothesis], [language], DATA_RANGE, [options.FINITE]))
    # run on MPI
    results = MPI_map(run, args)

    # ========================================================================================================
    # Get stats
    # ========================================================================================================
    # collapse all returned sets
    hypotheses = set()
    for r in results:
        hypotheses.update(r)  # add the i-th worker's result set

    dump(hypotheses, open('hypotheses'+suffix, 'w'))

    # get precision and recall for h