Example #1
def make_data(n=1, alpha=0.9, dataset=['A']):
    data = []
    if 'A' in dataset:
        data.extend([
            FunctionData(input=[
                Obj(shape='rhombus', color='cinnabar', size='miniature')
            ],
                         output=False,
                         alpha=alpha),
            FunctionData(input=[
                Obj(shape='pentagon', color='viridian', size='colossal')
            ],
                         output=True,
                         alpha=alpha)
        ] * n)
    if 'B' in dataset:
        data.extend([
            FunctionData(input=[
                Obj(shape='rhombus', color='cinnabar', size='miniature')
            ],
                         output=False,
                         alpha=alpha),
            FunctionData(input=[
                Obj(shape='dodecahedron',
                    color='cerulean',
                    size='intermediate')
            ],
                         output=True,
                         alpha=alpha)
        ] * n)
    return data
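A minimal driver for this helper might look like the following sketch (MyHypothesis and MHSampler are assumed to be defined elsewhere, as in the later examples):

# Hypothetical usage; MyHypothesis is not defined in this snippet.
data = make_data(n=5, dataset=['A', 'B'])
h0 = MyHypothesis()
for h in MHSampler(h0, data, steps=1000):
    pass  # inspect h and h.posterior_score here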
Example #2
def make_data(alpha=0.99, size=1):
    # the target concept here is doubling: x :-> cons(x, x)
    return [
        FunctionData(
            input=[[]],
            output=[[], []],
            alpha=alpha,
        ),
        FunctionData(
            input=[[[]]],
            output=[[[]], [[]]],
            alpha=alpha
        )
    ] * size
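For reference, the target concept implied by these two data points is list doubling; a sketch (the name `target` is hypothetical):

def target(x):
    # cons(x, x): pair the input list with itself
    return [x, x]

assert target([]) == [[], []]
assert target([[]]) == [[[]], [[]]]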
Example #3

def some_stuff():


    # simulate some data with a high probability of center embedding
    # and a lower probability of tail recursion
    lst = ["(", "[", "]", ")"]
    randomTstLists = [("(", ")", "[", "]"), ("[", "]", "(", ")")]
    d = simulateData(lst, randomTstLists, p=1.0, N=12)

    for k in d.keys():
        if d[k] > 0.0:
            print k, d[k]
    print len(d)

    #print isValidCenterEmbed(("(", "]"))
    #print isValidCenterEmbed(('(', '(', ')', ']'))

    data = [ FunctionData(input=(), output=d, alpha=0.9) ]
    h0 = MyHypothesis()
    from numpy import exp
    for h in MHSampler(h0, data, steps=5000):
        y = h()
        #print h, y
        if isValidCenterEmbed(y):
            print exp(h.compute_posterior(data)), h, y
        # else:
        #     print h.compute_likelihood(data), h, y
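The isValidCenterEmbed helper is defined elsewhere in this project; a minimal sketch of what such a check could look like (hypothetical, assuming "center embedding" means all opening brackets precede their closings in mirrored order):

def isValidCenterEmbed(s):
    # hypothetical sketch: the first half must be opening brackets and the
    # second half must close them in reverse (mirrored) order
    match = {'(': ')', '[': ']'}
    n = len(s)
    if n == 0 or n % 2 != 0:
        return False
    opens, closes = s[:n / 2], s[n / 2:]
    return (all(c in match for c in opens)
            and all(match[o] == c for o, c in zip(reversed(opens), closes)))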
Example #4
def get_kl_seq():
    """
    1. read posterior sequences
    2. compute KL-divergence between adjacent distributions
    3. plot

    run: serial
    """
    print 'loading..'
    fff()
    seq_set = [
        load(open('seq0_0825_234606')),
        load(open('seq1_0825_234606')),
        load(open('seq2_0825_234606'))
    ]
    # seq_set = load(open('seq0_0825_195540')) + load(open('seq1_0825_195540')) + load(open('seq2_0825_195540'))
    kl_seq_set = []

    print 'compute prior..'
    fff()
    # add a posterior set computed without observing any data
    from copy import deepcopy
    dict_0 = deepcopy(seq_set[0][0])
    for h in dict_0:
        dict_0[h] = h.compute_posterior(
            [FunctionData(input=[], output=Counter())])

    print 'making plot..'
    fff()
    for seq in seq_set:
        kl_seq = []

        seq.insert(0, dict_0)

        # compute kl
        for i in xrange(len(seq) - 1):
            current_dict = seq[i]
            next_dict = seq[i + 1]
            kl = compute_kl(current_dict, next_dict)
            kl_seq.append(log(kl))
            print 'KL from %i to %i: ' % (i, i + 1), kl
            fff()

        kl_seq_set.append(kl_seq)
        print '=' * 50
        fff()

    staged, = plt.plot(range(12, 145, 2), kl_seq_set[0], label='staged')
    normal, = plt.plot(range(12, 145, 2), kl_seq_set[1], label='normal')
    uniform, = plt.plot(range(12, 145, 2), kl_seq_set[2], label='uniform')

    plt.legend(handles=[normal, staged, uniform])
    plt.ylabel('log KL-divergence')
    plt.xlabel('data')
    plt.show()
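The compute_kl helper is not shown; a plausible sketch, assuming each dict maps hypotheses to log-posterior scores over the same support:

from math import exp, log

def compute_kl(p_dict, q_dict):
    # hypothetical: renormalize log scores into distributions, then KL(P || Q)
    def normalize(d):
        m = max(d.values())
        z = sum(exp(v - m) for v in d.values())
        return dict((h, exp(v - m) / z) for h, v in d.items())
    p, q = normalize(p_dict), normalize(q_dict)
    return sum(pv * log(pv / q[h]) for h, pv in p.items() if pv > 0)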
Example #5
def import_pd_data(fname):
    import pandas as pd
    from collections import defaultdict

    df = pd.read_pickle(fname)

    grouped = df.groupby(['concept', 'target'], as_index=False)
    data = defaultdict(lambda: FunctionData(input=[], output={}))

    for (c, t), group in grouped:
        y = sum(group['rating'])
        n = len(group['rating']) - y

        try:
            concept = list(eval(c))
        except Exception:  # the eval'd concept is a single, non-iterable value
            concept = [eval(c)]
        target = eval(t)

        data[c].input = concept
        data[c].output[target] = (y, n)

    return data.values()
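The dataframe layout this loader assumes can be reconstructed from the groupby and the rating sum; a hypothetical toy example:

import pandas as pd

# hypothetical toy frame: one 0/1 rating per (concept, target) judgment
df = pd.DataFrame({
    'concept': ["('red',)", "('red',)", "('red',)"],
    'target': ["1", "1", "2"],
    'rating': [1, 0, 1],
})
df.to_pickle('toy_ratings.pkl')
print import_pd_data('toy_ratings.pkl')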


# josh_data = import_josh_data()
Example #6
def import_josh_data(path=None):
    """Script for loading Joshs' number game data.

    Data is originally in probability (i.e. float) format, so (# yes, # no) pairs are estimated by
    assuming 20 human participants.

    """
    import os
    from scipy.io import loadmat

    if path is None:
        path = os.getcwd()
    mat = loadmat(path + '/number_game_data.mat')
    mat_data = mat['data']
    number_game_data = []

    for d in mat_data:

        input_data = d[0][0].tolist()
        output_data = {}

        for i in range(len(d[1][0])):
            key = d[1][0][i]
            associated_prob = d[2][0][i]
            associated_yes = int(associated_prob * 20)
            # estimated (# yes, # no) responses
            output_data[key] = (associated_yes, 20 - associated_yes)

        function_datum = FunctionData(input=input_data, output=output_data)
        number_game_data.append(function_datum)
    return number_game_data
Example #7
    def sample_data(self, n):
        # Sample a string of data
        cnt = Counter()
        for _ in xrange(n):
            cnt[self.sample_string()] += 1

        return [FunctionData(input=[], output=cnt, alpha=self.ALPHA)]
Example #8
def load_words_and_data(path):
    """
    Takes a data path and return [words, data]
    """

    # Load the data
    data = []
    with open(path, 'r') as f:
        for l in f:
            if re.match(r"\s*#", l): continue  # skip comments
            if not re.match(r"[^\s]", l): continue  # skip whitespace

            lhs, output = re.split(r"\s*=\s*", l.strip())
            args = re.split(r"\s+", lhs)

            data.append(FunctionData(input=args, output=output))
            #print "Loading data", args, "->", output

    # Figure out all the words! (here, tokens)
    words = set()
    for di in data:
        words.add(di.output)
        for x in di.input:
            words.add(x)
    words = list(words)

    return [words, data]
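The input file format this parser expects (inferred from the split regexes; the tokens below are hypothetical):

# one datum per line: whitespace-separated input args, '=', then the output;
# lines starting with '#' and blank lines are skipped, e.g.:
#
#   john mary = gave
#   mary john = received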
Example #9
def run(save_file, alpha, iters, propose_scale, propose_n, skip, summary_cap):
    # Faux data
    data = [
        HumanData(
            data=FunctionData(input=[2,4,6,8], output=[]),
            queries=(1, 20, 30, 48, 80, 99),
            responses=((1, 19), (17, 3), (15, 5), (19, 1), (20, 0), (2, 18))
        ),
        HumanData(
            data=FunctionData(input=[10, 40], output=[]),
            queries=(1, 20, 30, 48, 80, 99),
            responses=((1, 19), (20, 0), (20, 0), (2, 18), (19, 1), (2, 18))
        )
    ]


    # Enumerate all 'domain level' hypotheses generated by our grammar
    hypotheses = []
    for fn in simple_grammar.enumerate(d=10):
        h = NumberGameHypothesis(grammar=simple_grammar, domain=100, alpha=alpha)
        h.set_value(fn)
        h.compute_prior()
        hypotheses.append(h)

    grammar_h0 = GrammarHypothesisVectorized(simple_grammar, hypotheses,
                                             propose_scale=propose_scale, propose_n=propose_n)
    mh_grammar_sampler = MHSampler(grammar_h0, data, iters)
    mh_grammar_summary = VectorSummary(skip=skip, cap=summary_cap)

    print '^*'*60, '\nGenerating GrammarHypothesis Samples\n', '^*'*60

    # Initialize csv file
    mh_grammar_summary.csv_initfiles(save_file)

    # Sample GrammarHypotheses!
    for i, gh in enumerate(mh_grammar_summary(mh_grammar_sampler)):
        if (i % 10 == 0):
            print i, " ITERATIONS"
            print '\n', '#'*100

        # Save to CSV & print grammar rule values
        if (i % skip == 0):
            mh_grammar_summary.csv_appendfiles(save_file, data)
            for idx in grammar_h0.get_propose_idxs():
                print idx, '\t|  ', grammar_h0.rules[idx]

    mh_grammar_summary.pickle_summary(filename=save_file + '_summary.p')
Example #10
    def sample_data_as_FuncData(self, n, avg=True):
        """
        n: can be float in avg mode
        finite: limits the max_length of data
        avg: sample for multiple times and average to reduce noise, note the cnt can have fraction
        """
        if n == 0:
            return [FunctionData(input=[], output=Counter())]

        if avg:
            cnt = Counter(self.sample_data(int(n * 512)))
            n = float(512)
            for key in cnt.keys():
                cnt[key] /= n
            return [FunctionData(input=[], output=cnt)]

        return [FunctionData(input=[], output=Counter(self.sample_data(n)))]
Example #11
    def sample_data(self, n):
        # Sample a string of data
        cnt = Counter()
        for _ in xrange(n):
            s = str(self.grammar.generate())
            cnt[s] += 1

        return [FunctionData(input=[0], output=cnt)]
Example #12
File: Model.py  Project: wrongu/LOTlib
def make_data(n=1):
    return [
        FunctionData(
            input=[{Obj(color='red'),
                    Obj(color='red'),
                    Obj(color='green')}],
            output=True,
            alpha=0.99)
    ] * n
Example #13
def make_data(size=10, alpha=0.99):
    # Replicate all of the data `size` times. The labels flip with the context
    # cue: under 'A' only the red square is True; under 'B' everything else is.
    return [FunctionData(['A', Obj(shape='square', color='red')], True, alpha=alpha),
            FunctionData(['A', Obj(shape='square', color='blue')], False, alpha=alpha),
            FunctionData(['A', Obj(shape='triangle', color='blue')], False, alpha=alpha),
            FunctionData(['A', Obj(shape='triangle', color='red')], False, alpha=alpha),

            FunctionData(['B', Obj(shape='square', color='red')], False, alpha=alpha),
            FunctionData(['B', Obj(shape='square', color='blue')], True, alpha=alpha),
            FunctionData(['B', Obj(shape='triangle', color='blue')], True, alpha=alpha),
            FunctionData(['B', Obj(shape='triangle', color='red')], True, alpha=alpha)] * size
Example #14
File: Model.py  Project: flrgsr/LOTlib
def make_data(size=50):
    return [
        FunctionData(input=[],
                     output={
                         'N V': size,
                         'D N V': size,
                         'D N V N': size,
                         'D N V D N': size
                     })
    ]
Example #15
def make_data(n=1, target=F1, data_size=100, sd=0.1):
    # sample `data_size` noisy observations of `target`; random, normal and F1
    # are assumed to be defined at module level
    data = []
    for i in range(data_size):
        x = random()
        y = target(x) + normal() * sd
        data.append(FunctionData(input=[x], output=y, ll_sd=sd))

    return data * n
Example #16

def get_top_N(pair1, pair2):
    priors = {}
    complete = 0
    for p1 in pair1:
        for p2 in pair2:
            data = [
                FunctionData(alpha=alpha, input=[p1], output={p2: len(p2)})
            ]
            h0 = MyHypothesis()
            top_hyps = set()
            seen = set()

            chains = 0
            while ((len(seen) < n_top and chains < max_chains)
                   or (len(seen) < 3)):
                chains += 1
                x = 0
                for h in MHSampler(h0,
                                   data,
                                   steps=steps,
                                   acceptance_temperature=acc_temp):
                    #print y
                    out = h(p1)[:len(p2)]

                    str_h = str(h)
                    if len(out) == len(p2) and hamming_distance(out, p2) == 0:
                        if str_h not in seen:  #and "from" not in str_h[14:]:
                            top_hyps.add((copy.deepcopy(h), h.prior))
                            seen.add(str_h)

                    if x % 1000 == 0:
                        print_star(x, h, out, p2, h.value.get_rule_signature(),
                                   len(seen))

                    x += 1

            print_star()
            priors[(p1, p2)] = []
            for h in sorted(top_hyps, key=lambda tup: -tup[1])[:n_top]:
                print p1, p2
                print h[0], h[1], h[0].value.count_subnodes()
                priors[(p1, p2)].append(
                    (copy.deepcopy(h[0]), h[1], h[0].value.count_subnodes()))
            complete += 1
            print "complete: %d" % complete

    for key in priors:
        print "***"
        print key
        for p in priors[key]:
            print p

        print "***"

    return priors
Example #17
def make_data(data_size=1, alpha=0.95):
    data = []
    for ws in random.sample(WSList, data_size):
        data.append(FunctionData(input=[ws[0]], output=ws[1], alpha=alpha))
    return data



# data = [ FunctionData(input=[WS0_initial], output=WS0_end, alpha=0.95)]
# data_1 = [ FunctionData(input=[WS0_initial], output=WS0_end, alpha=0.95), FunctionData(input=[WS1_initial], output=WS1_end, alpha=0.95)]
Example #18
def make_data(size=1, alpha=0.99):
    return [
        FunctionData(input=['aaaa'], output=True, alpha=alpha),
        FunctionData(input=['aaab'], output=False, alpha=alpha),
        FunctionData(input=['aabb'], output=False, alpha=alpha),
        FunctionData(input=['aaba'], output=False, alpha=alpha),
        FunctionData(input=['aca'], output=True, alpha=alpha),
        FunctionData(input=['aaca'], output=True, alpha=alpha),
        FunctionData(input=['a'], output=True, alpha=alpha)
    ] * size
Example #19
def make_data(N=30):
    """
    The data here consist of saffran-aslin-newport type strings. They have a geometric length distribution more like what
    you might find in natural data, with more frequent shorter strings. This is modeled in the hypothesis with a flip to
    whether or not you recurse to generate a longer string.
    """

    cnt = Counter()
    for _ in xrange(N):
        cnt[''.join(sample_one(words) for _ in xrange(5))] += 1

    return [FunctionData(input=[], output=cnt)]
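`words` and `sample_one` are assumed at module level; a sketch using the classic Saffran-Aslin-Newport vocabulary (the helper shown is hypothetical):

import random

words = ['tupiro', 'golabu', 'bidaku', 'padoti']  # assumed vocabulary

def sample_one(xs):
    # hypothetical helper: uniform random choice
    return random.choice(xs)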
Example #20
def treebank2FunctionData(strs):
    """
        Parse treebank-style trees into FunctionNodes and return a list of FunctionData with the right format

        The data is in the following format:

            di.args = T
            di.output = []

        where we the data function "output" is implicit in T (depending on where we choose pronouns)
        SO: All the fanciness is handled in the likelihood
    """
    return map(
        lambda s: FunctionData(input=[list2FunctionNode(parseScheme(s))],
                               output=None), strs)
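Hypothetical usage, assuming parseScheme and list2FunctionNode handle s-expression strings:

trees = ['(S (NP (PRP he)) (VP (V saw) (NP (DT the) (N dog))))']
data = treebank2FunctionData(trees)
# each datum wraps the parsed tree; data[0].output is None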
Example #21
def make_data(n=1):
    return [
        FunctionData(input=[1000.], output=1500., ll_sd=data_sd),
        FunctionData(input=[828.], output=1340., ll_sd=data_sd),
        FunctionData(input=[800.], output=1328., ll_sd=data_sd),
        FunctionData(input=[600.], output=1172., ll_sd=data_sd),
        FunctionData(input=[300.], output=800., ll_sd=data_sd),
        FunctionData(input=[0.], output=0.,
                     ll_sd=data_sd)  # added 0,0 since it makes physical sense.
    ] * n
Example #22
def myrun(observed_set):

    if LOTlib.SIG_INTERRUPTED:
        return set()

    h0 = NumberGameHypothesis(grammar=grammar)

    data = [FunctionData(input=[], output=observed_set, alpha=ALPHA)]

    tn = TopN(N=options.TOP_COUNT)
    for h in break_ctrlc(MHSampler(h0, data, steps=options.STEPS)):
        tn.add(h)

    print "# Finished %s" % str(observed_set)

    return set(tn.get_all())
Example #23
def make_data(n=1):

    data = []

    for _ in xrange(n):
        for a, b in itertools.product(OBJECTS, OBJECTS):

            myinput = [a, b]

            # opposites (n/p) interact; x interacts with nothing
            myoutput = (a[0] != b[0]) and (a[0] != 'x') and (b[0] != 'x')

            data.append(
                FunctionData(input=myinput, output=myoutput, alpha=0.99))

    return data
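A sketch of the OBJECTS the labeling rule implies (the names are hypothetical; only the first character, the charge tag, matters):

OBJECTS = ['n1', 'n2', 'p1', 'p2', 'x1']
# e.g. ('n1', 'p2') -> True (opposite charges interact),
#      ('n1', 'n2') -> False, ('n1', 'x1') -> False (x is inert)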
Example #24

def get_top_N(pair1, pair2):
    priors = {}
    for p1 in pair1:
        for p2 in pair2:
            data = [
                FunctionData(alpha=alpha, input=[p1], output={p2: len(p2)})
            ]
            h0 = MyHypothesis()
            top_hyps = set()
            seen = set()

            x = 0
            while len(top_hyps) < n_top * 2:
                for h in MHSampler(h0, data, steps=steps):
                    #print y
                    out = h(p1)[:len(p2)]

                    str_h = str(h)
                    if len(out) == len(p2) and hamming_distance(out, p2) == 0:
                        if str_h not in seen:
                            top_hyps.add((copy.deepcopy(h), h.prior))
                            seen.add(str_h)

                    if x % 1000 == 0:
                        print p1, p2
                        print_star(x, h, out, p2, h.value.get_rule_signature())

                    x += 1
            print_star()
            priors[(p1, p2)] = []
            for h in sorted(top_hyps, key=lambda tup: -tup[1])[:n_top]:
                print p1, p2
                print h[0], h[1], h[0].value.count_subnodes()
                priors[(p1, p2)].append(
                    (copy.deepcopy(h[0]), h[1], h[0].value.count_subnodes()))

    for key in priors:
        print "***"
        print key
        for p in priors[key]:
            print p

        print "***"

    return priors
Example #25

def main():


    lst = ["(", "[", "]", ")"]
    randomTstLists = [("(", ")", "[", "]"), ("[", "]", "(", ")")]

    # d simulates N participants who center-embed correctly at rate pC,
    # tail-embed at rate pT, and do something else the rest of the time
    # FINISH THIS!!!
    d = simulateData(lst, randomTstLists, pC=0.4, pT=0.6, N=500)
    for k in d.keys():
        print k, d[k]

    data = [ FunctionData(input=(), output=d, alpha=0.9) ]
    r = run(data, TOP=5, STEPS=5000)
    for i in r:
        print i
        print i[0](), i[0](), i[0]()
Example #26
def csvToFunctionData(filename):
    import csv
    from collections import defaultdict

    with open(filename, mode='rb') as f:
        reader = csv.reader(f)
        rows = [row for row in reader]
        ins = defaultdict(list)
        outs = defaultdict(list)

        # Fill `ins` and `outs` dictionaries
        for row in rows:
            if row[1] == 'in':
                ins[row[0]].append(row[2])
            if row[1] == 'out':
                outs[row[0]].append([row[2:]])

        # Fill FunctionData objects
        data_keys = set([row[0] for row in rows])
        data = {}
        for k in data_keys:
            data[k] = FunctionData(input=ins[k], output=outs[k])
        return data
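The CSV layout this reader expects (reconstructed from the row indexing; the key names are hypothetical):

# column 0: datum key, column 1: 'in' or 'out', columns 2+: values, e.g.:
#
#   concept1,in,five
#   concept1,out,yes,17,3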
Example #27
def make_data(data_size=300, alpha=0.75):
    """
    Sample some data according to the target
    """
    data = []
    for i in range(data_size):
        # how many in this set
        set_size = weighted_sample(
            range(1, 10 + 1),
            probs=[7187, 1484, 593, 334, 297, 165, 151, 86, 105, 112])
        # get the objects in the current set
        s = set(sample_sets_of_objects(set_size, all_objects))

        # sample according to the target
        if random() < alpha: r = WORDS[len(s) - 1]
        else: r = weighted_sample(WORDS)

        # and append the sampled utterance
        data.append(FunctionData(input=[s], output=r, alpha=alpha))
    return data
Example #28
def make_data(size=datamt):
    return [
        FunctionData(input=[],
                     output={
                         'h e s': size,
                         'm e s': size,
                         'm e g': size,
                         'h e g': size,
                         'm e n': size,
                         'h e m': size,
                         'm e k': size,
                         'k e s': size,
                         'h e k': size,
                         'k e N': size,
                         'k e g': size,
                         'h e n': size,
                         'm e N': size,
                         'k e n': size,
                         'h e N': size,
                         'f e N': size,
                         'g e N': size,
                         'n e N': size,
                         'n e s': size,
                         'f e n': size,
                         'g e n': size,
                         'g e m': size,
                         'f e m': size,
                         'g e k': size,
                         'f e k': size,
                         'f e g': size,
                         'f e s': size,
                         'n e g': size,
                         'k e m': size,
                         'n e m': size,
                         'g e s': size,
                         'n e k': size
                     })
    ]
Example #29
File: utils.py  Project: wrongu/LOTlib
def uniform_data(size, max_length=None):
    # spread `size` observations evenly over the strings a^i b^i,
    # for lengths 2..max_length (a value for max_length must be supplied)
    assert max_length is not None
    cnt = Counter()
    num = size * 2 / max_length
    for i in xrange(1, max_length / 2 + 1):
        cnt['a'*i+'b'*i] = num
    return [FunctionData(input=[], output=cnt)]
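As a quick arithmetic check: uniform_data(12, 6) gives num = 12 * 2 / 6 = 4, so the returned counter is {'ab': 4, 'aabb': 4, 'aaabbb': 4}, i.e. twelve observations spread evenly over the three string lengths.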
Example #30

def run(pairs):

    priors = {}
    complete = 0
    top_hyps = set()
    already_done = set()
    t_start = time.time()

    for pair in pairs:
        p1 = pair[0]
        p2 = pair[1]

        h0 = MyHypothesis()
        t_pair = time.time()
        #h0.start_counts = add_counts

        seen = set()
        #for ind in xrange(2, 3):
        for ind in xrange(len(p1) + 1):

            seen_round = set()
            x = 0
            p1_i = p1[:ind]
            p2_i = p2[:ind]
            if (p1, p2_i) not in already_done:
                already_done.add((p1, p2_i))
                data = [
                    FunctionData(alpha=alpha,
                                 input=[p1],
                                 output={p2_i: len(p2_i)})
                ]

                while len(seen_round) < n_top:
                    for h in MHSampler(h0,
                                       data,
                                       steps=steps,
                                       acceptance_temperature=acc_temp,
                                       prior_temperature=prior_temp):
                        if len(seen_round) >= n_top:
                            break
                        str_h = str(h.value)
                        out = h(p1)[:len(p2_i)]
                        if (len(out) == len(p2_i)
                                and (hamming_distance(out, p2_i) == 0)
                                and (len(h(p1)[:len(p1)]) == len(p1))):
                            if str_h not in seen:  #and "from" not in str_h[14:]:
                                l_rules = [
                                    str(i) for i in list(
                                        numpy.hstack(
                                            get_rule_counts(grammar, h.value)))
                                ]
                                top_hyps.add(
                                    (toAB(p1), ind, copy.deepcopy(h), toAB(p2),
                                     toAB(h(p1_i)[:len(p1)]),
                                     ",".join(l_rules), str(h.value)))
                                seen.add(str_h)
                            if str_h not in seen_round:
                                seen_round.add(str_h)

                        if x % 1000 == 0:
                            print_star(
                                "seen:%d" % len(seen_round), "steps:%d" % x,
                                "hyp:%s" % str_h, "p2:%s" % p2_i,
                                "out:%s" % out, "prior:%f" % h.prior,
                                "pair_time:%.2f" % (time.time() - t_pair),
                                "tot_time:%.2f" % (time.time() - t_start))

                        x += 1

        for h in top_hyps:
            print_star(h[0], h[1], h[2], h[3], h[4], h[5])

    return top_hyps