Exemplo n.º 1
0
def SeekToRoot(con, user, dependant):
    """Build a random governor chain from ``dependant`` up to the root.

    Starting at ``dependant``, a random row of ``<user>_procd`` having that
    word as dependant is picked; its governor becomes the next word.  The
    walk repeats (each step constrained to a sentence in which the previous
    arc actually occurs) until the governor is the literal ``'root'``.

    Args:
        con: database handle exposing ``query(sql, *params)`` and returning
            a sequence of dict-like rows.
        user: prefix of the ``<user>_procd`` table; interpolated directly
            into the SQL text.
        dependant: the word to start from (must support ``.encode``).

    Returns:
        ``deptree.DependTree(None)`` when the word has no rows at all;
        otherwise the topmost ``DependTree`` of a single-branch tree whose
        deepest node is ``dependant``.

    Raises:
        AssertionError: if an intermediate lookup returns no rows, if the
            governor row is not unique, or if the chain does not end in a
            ``'root'`` arc.
    """
    start_sql = ("select governor, arctype from %s_procd "
                 "where dependant = %%s") % user
    candidates = con.query(start_sql, dependant.encode("utf8"))
    if len(candidates) == 0:
        return deptree.DependTree(None)

    first = random.choice(candidates)
    # chain holds (arctype, word) pairs, deepest word first.
    chain = [(first["arctype"], dependant)]
    dependant = first['governor']

    while dependant != 'root':
        prev_arctype, prev_word = chain[-1]
        # Find sentences where the current word governs the previous word
        # through the arc type recorded for it.
        link_sql = ("select sentence_id as sid, governor_id as gid from %s_procd "
                    "where arctype = '%s' and governor = %%s and dependant = %%s")
        link_sql = link_sql % (user, prev_arctype)
        occurrences = con.query(link_sql, dependant, prev_word)
        assert len(occurrences) != 0
        picked = random.choice(occurrences)

        # Fetch the row in which the current word is itself the dependant,
        # i.e. its own governor and incoming arc within that sentence.
        parents = con.query("select governor, arctype from %s_procd where sentence_id = %s and dependant_id = %s" % (user, picked["sid"], picked["gid"]))
        assert len(parents) == 1, ("SeekToRoot",parents)
        parent = parents[0]
        chain.append((parent['arctype'], dependant))
        dependant = parent['governor']

    # Wrap from the deepest word outwards: every step makes the tree built
    # so far the only child of the next word up.
    nested = []
    for arctype, word in chain:
        nested = [(arctype, deptree.DependTree(word, nested))]
    top_arctype, top_tree = nested[0]
    assert top_arctype == 'root', ("SeekToRoot",nested)
    return top_tree
Exemplo n.º 2
0
def Expand(con, word, height=0, user=None, fixed_siblings=None, dbg_out=None, symbols=None, parent_arctype=None):
    """Recursively grow a dependency tree below ``word``.

    ``SubsetSelector`` chooses the outgoing arcs for ``word``; every chosen
    dependant is then expanded the same way, one level deeper.

    Args:
        con: database handle, forwarded to ``SubsetSelector``.
        word: the word at the node being expanded.
        height: depth of this node (0 for the top-level call).
        user: table prefix, forwarded to ``SubsetSelector``.
        fixed_siblings: ``DependTree`` whose children must appear under this
            node; a fresh empty ``deptree.DependTree(None)`` when omitted.
        dbg_out: dict that collects debugging information; a fresh dict when
            omitted.  The same dict is shared by the whole recursion.
        symbols: dict forwarded to ``SubsetSelector``; a fresh dict when
            omitted.  The same dict is shared by the whole recursion.
        parent_arctype: arc type through which ``word`` was reached.

    Returns:
        ``deptree.DependTree(word, children)`` where ``children`` is a list
        of ``(arctype, subtree)`` pairs.
    """
    # Defaults are created per call: a literal {} / DependTree default would
    # be a single object shared (and mutated) across unrelated calls.
    if fixed_siblings is None:
        fixed_siblings = deptree.DependTree(None)
    if dbg_out is None:
        dbg_out = {}
    if symbols is None:
        symbols = {}

    arctypes = SubsetSelector(con, word, user=user, height=height,
                              fixed_siblings=fixed_siblings,
                              parent_arctype=parent_arctype,
                              dbg_out=dbg_out, symbols=symbols)
    outs = []
    for at, dep, fixd in arctypes:
        subtree = Expand(con, dep,
                         height=height + 1, user=user, fixed_siblings=fixd,
                         parent_arctype=at, symbols=symbols, dbg_out=dbg_out)
        outs.append((at, subtree))
    return deptree.DependTree(word, outs)
Exemplo n.º 3
0
def SubsetSelector(con, word, fixed_siblings=None, height=0, user=None, params=None, dbg_out=None, symbols=None, parent_arctype=None):
    """Pick the set of outgoing arcs to place under ``word``.

    Candidate arc sets come from ``HistogramSubsets``.  Each candidate is
    weighted (smaller sets are favoured, more strongly at greater
    ``height``; sets containing a word from ``symbols`` are boosted), one is
    drawn with ``RandomWeightedChoice``, and its rows are re-read from
    ``<user>_procd``.  Arcs matching ``fixed_siblings`` keep their required
    subtree; other arcs may have their dependant swapped for a random one
    according to ``params["arc_wildness"]``.

    Args:
        con: database handle exposing ``query(sql, *params)``.
        word: the governor word.
        fixed_siblings: ``DependTree`` whose children must all be present in
            the chosen arc set; a fresh empty tree when omitted.
        height: depth of ``word`` in the tree being generated.
        user: table prefix, interpolated directly into the SQL text.
        params: dict with ``"height_throttler"`` and ``"arc_wildness"``
            entries; ``DEFAULT_PARAMS`` when omitted.
        dbg_out: dict receiving ``'facts'`` and ``"used_list"`` entries;
            mutated in place.  A fresh dict when omitted.
        symbols: dict mapping words to a numeric bonus; an entry is deleted
            once its word ends up in the result.  Mutated in place.  A fresh
            dict when omitted.
        parent_arctype: arc type through which ``word`` was reached.

    Returns:
        List of ``(arctype, dependant, DependTree)`` triples, where the tree
        carries the children that are fixed below that dependant.

    Raises:
        AssertionError: if no candidate exists, if the re-read rows do not
            match the chosen candidate, or if a fixed sibling is missing.
    """
    # Defaults are created per call; a shared {} default would accumulate
    # 'facts' / 'used_list' across unrelated calls.
    if fixed_siblings is None:
        fixed_siblings = deptree.DependTree(None)
    if dbg_out is None:
        dbg_out = {}
    if symbols is None:
        symbols = {}

    if 'facts' not in dbg_out:
        dbg_out['facts'] = []
    if params is None:
        params = DEFAULT_PARAMS
    hist = HistogramSubsets(con, word, user=user, fixed_siblings=fixed_siblings, parent_arctype=parent_arctype)
    if len(hist) == 0:
        assert False,  "generated no rows, word = %s, \nfixed=%s" % (word, str(fixed_siblings))
        return []  # only reached when assertions are disabled (-O)

    # Attach a sampling weight to every candidate.
    weighted = []
    for entry in hist:
        arcs = entry[0]
        if height == 0:
            denom = float(len(arcs))
        else:
            denom = (params["height_throttler"] * float(height)) ** len(arcs)
        weight = 0 if denom == 0 else (1.0 / denom)
        dependants = [pair[1] for pair in arcs]
        for s, k in symbols.items():
            if s in dependants:
                dbg_out['facts'].append("Bumped %s by %f" % (s, 4 + k))
                weight = weight * (4 + k)
        weighted.append((entry, weight))

    result_entry = RandomWeightedChoice(weighted)
    q = "select * from %s_procd where sentence_id = %s and governor_id = %s" % (user, result_entry[1], result_entry[2])
    result = [(r["arctype"], r["dependant"], r["dependant_id"]) for r in con.query(q)]
    # The message used to carry four placeholders for two values, so a
    # failing check raised TypeError instead of reporting the mismatch.
    assert sorted([(a, b) for a, b, c in result]) == sorted(result_entry[0]), "%s\n%s" % (result, result_entry)

    if "used_list" not in dbg_out:
        dbg_out["used_list"] = []
    if len(result) != 0:
        dbg_out["used_list"].append(int(result_entry[1]))

    # Replace the third slot of each entry with the subtree fixed below it.
    fixed_ixs = set()
    for at, fs in fixed_siblings.children:
        found = False
        for i, entry in enumerate(result):
            if (entry[0], entry[1]) == (at, fs.data):
                fixed_ixs.add(i)
                result[i] = (at, fs.data, deptree.DependTree(None, fs.children))
                found = True
                break
        assert found, ("not found", at, fs.data, result)
    for i, entry in enumerate(result):
        if i not in fixed_ixs:
            result[i] = (entry[0], entry[1], deptree.DependTree(None))

    wildness = params["arc_wildness"]
    for i, (arctype, dep_word, fixd) in enumerate(result):
        # random.random() is drawn before the fixed check, as before, so the
        # sequence of random numbers consumed is unchanged.
        if arctype in wildness and random.random() < wildness[arctype]:
            if i not in fixed_ixs:
                next_word = RandomDependant(con, user, word, arctype, symbols=symbols)
                dbg_out['facts'].append(dep_word + "->" + next_word)
                dep_word = next_word
                result[i] = (arctype, dep_word, fixd)
        # A symbol is consumed once its word appears in the output.
        if dep_word in symbols:
            del symbols[dep_word]
    return result
Exemplo n.º 4
0
def Generate(con, user, using=None, dbg_out=None, symbols=None):
    """Generate a random dependency tree, optionally containing ``using``.

    When ``using`` is given, ``SeekToRoot`` supplies a chain from the root
    down to that word, and the chain is passed to ``Expand`` as the fixed
    part of the tree.  Otherwise a random dependant of a ``'root'`` arc in
    ``<user>_procd`` is used as the top word.

    A deep copy of the result is stored in the module-level
    ``g_last_generated``.

    Args:
        con: database handle exposing ``query(sql, *params)``.
        user: table prefix, interpolated directly into the SQL text.
        using: word that must appear in the tree, or ``None``.
        dbg_out: dict that collects debugging information; a fresh dict when
            omitted.
        symbols: dict forwarded to ``Expand``; a fresh dict when omitted.

    Returns:
        The generated ``DependTree``, or ``None`` when ``using`` is given but
        ``SeekToRoot`` finds no chain for it.
    """
    global g_last_generated
    # Defaults are created per call; a shared {} default would be mutated
    # by the expansion and leak state between calls.
    if dbg_out is None:
        dbg_out = {}
    if symbols is None:
        symbols = {}

    if using is not None:
        fixed_chain = SeekToRoot(con, user, using)
        if fixed_chain.data is None:
            return None
        word = fixed_chain.data
        # The top word is handed to Expand separately; the chain object is
        # then reused as a data-less container for the fixed children.
        fixed_chain.data = None
    else:
        fixed_chain = deptree.DependTree(None)
        word = random.choice(con.query("select dependant from %s_procd where arctype = 'root'" % user))['dependant']
    result = Expand(con, word, parent_arctype='root', user=user, fixed_siblings=fixed_chain, dbg_out=dbg_out, symbols=symbols)
    g_last_generated = copy.deepcopy(result)
    return result
Exemplo n.º 5
0
def MultiAuxRW(t):
    """Merge every "aux" and "neg" child of ``t`` into one "aux" child.

    The rewrite requires more than one "aux" child and requires every
    "aux"/"neg" child to be a leaf (enforced through ``CHECK``, which is
    defined elsewhere -- presumably it aborts the rewrite when false).  The
    child texts are ordered as "to", "would", "not", <anything else>,
    "have", joined with spaces, and appended as a single new "aux" leaf
    after the old children are removed.

    Args:
        t: the tree node to rewrite; modified in place.

    Returns:
        The same node ``t``.
    """
    aux_arcs = t.FindAll("aux")
    CHECK(len(aux_arcs) > 1)
    for arc in aux_arcs:
        CHECK(t.Child(arc).IsLeaf())
    neg_arcs = t.FindAll("neg")
    for arc in neg_arcs:
        CHECK(t.Child(arc).IsLeaf())

    slots = ["to", "would", "not", "_", "have"]

    def slot_of(text):
        # Words without a slot of their own share the "_" position.
        if text in slots:
            return slots.index(text)
        return slots.index("_")

    pieces = sorted([t.ChildStr(arc) for arc in aux_arcs + neg_arcs], key=slot_of)

    # Remove exactly as many children as were found, one lookup per removal.
    for _ in xrange(len(aux_arcs)):
        t.Pop(t.FindNoCheck("aux"))
    for _ in xrange(len(neg_arcs)):
        t.Pop(t.FindNoCheck("neg"))
    assert t.FindNoCheck("aux") is None
    assert t.FindNoCheck("neg") is None

    merged = deptree.DependTree(" ".join(pieces))
    t.children.append(("aux", merged))
    return t
Exemplo n.º 6
0
def HistogramSubsets(con, word, parent_arctype=None, fixed_siblings=None, user=None):
    """List the child-arc sets observed under occurrences of ``word``.

    Every occurrence of ``word`` as a dependant in ``<user>_procd``
    (optionally restricted to those reached through ``parent_arctype``) is
    joined with the rows it governs.  Each occurrence yields one candidate:
    the list of ``(arctype, dependant)`` pairs below it.

    Candidates are dropped when they contain a "cc", "num" or "number" arc,
    when they contain more than one "nsubj" arc, or when ``Subset`` rejects
    them against the ``fixed_siblings`` children.

    Args:
        con: database handle exposing ``query(sql, *params)`` and returning
            a sequence of dict-like rows.
        word: the word whose occurrences are examined.
        parent_arctype: if not ``None``, only occurrences with this incoming
            arc type are used.  Interpolated directly into the SQL text.
        fixed_siblings: ``DependTree`` with ``data is None`` whose children
            are the required arcs; a fresh empty tree when omitted.
        user: table prefix, interpolated directly into the SQL text.

    Returns:
        List of ``(pairs, sentence_id, dependant_id)`` tuples, ``pairs``
        being a list of ``(arctype, dependant)`` tuples.

    Raises:
        AssertionError: if the query returns no rows, if
            ``fixed_siblings.data`` is not ``None``, or if a row's arc and
            dependant lists differ in length.
    """
    subs =  ("select dl.sentence_id as sid, dl.dependant_id as did, "
             "group_concat(dr.arctype separator '___')   as gc_arc, "
             "group_concat(dr.dependant separator '___') as gc_dep, "
             "count(dr.dependant) as groupsize "
             "from %s_procd dl left join %s_procd dr "
             "on dl.sentence_id = dr.sentence_id and dl.dependant_id = dr.governor_id "
             "where dl.dependant = %%s %s "
             "group by dl.sentence_id, dl.dependant_id ")
    extra_cond = ""
    params = [word]
    if parent_arctype is not None:
        extra_cond += ("and dl.arctype = '%s'" % parent_arctype)
    q = subs % (user, user, extra_cond)
    qres = con.query(q, *params)

    # Must be checked before max(): max() of an empty sequence raises
    # ValueError, which used to hide this intended failure.
    if len(qres) == 0:
        raise AssertionError("before filtering no rows")

    maxgroupsize = max(int(r["groupsize"]) for r in qres)
    if maxgroupsize == 0:
        # No occurrence has any children, so all candidates are the same
        # empty set; one representative is enough.
        qres = [qres[0]]

    hists = [(([] if r["gc_arc"] is None else r["gc_arc"].split("___")),
              ([] if r["gc_dep"] is None else r["gc_dep"].split("___")),
              r["sid"],
              r["did"])
             for r in qres]

    disallowed = ["cc"]
    disallowed.extend(["num", "number"])  # this will add some stability for now...

    if fixed_siblings is None:
        fixed_siblings = deptree.DependTree(None)
    assert fixed_siblings.data is None
    fixed_tups = [(fs[0], fs[1].data) for fs in fixed_siblings.children]

    result = []
    for h in hists:
        arcs, deps, sid, did = h
        assert len(arcs) == len(deps), h
        if any(a in disallowed for a in arcs):
            continue
        if arcs.count("nsubj") >= 2:
            continue
        # Materialised as a list so the pairs can be measured with len()
        # and iterated more than once.
        zipd = list(zip(arcs, deps))
        if Subset(zipd, fixed_tups):
            result.append((zipd, sid, did))
    return result