示例#1
0
 def play(*x, **kw):
     """Show the model's prediction for one example, or step through all.

     Call forms:
         play(i) -- decode row ``i``: the question from ``qmat_x`` (decoded
                    with ``reverse=True``), the reference answer from
                    ``amat_x[i, 1:]`` and the argmax of ``encdec.predict``.
                    Prints the three strings and returns True, unless the
                    example is hidden (see ``hidecorrect``).
         play()  -- run ``play(i)`` for every row of ``qmat_x`` and wait for
                    Enter (``raw_input``) after each example that was
                    printed. Returns None.

     Keyword arguments:
         hidecorrect -- when true, an example whose prediction starts with
                        the reference answer is not printed and False is
                        returned. Defaults to False.

     Raises:
         Exception: if more than one positional argument is given.
     """
     hidecorrect = kw.get("hidecorrect", False)
     if len(x) == 1:
         x = x[0]
         q = wordids2string(qmat_x[x],
                            rwd=qrwd,
                            maskid=maskid,
                            reverse=True)
         ga = wordids2string(amat_x[x, 1:], rwd=arwd, maskid=maskid)
         pred = encdec.predict(qmat_x[x:x + 1], amati_x[x:x + 1, :-1])
         pa = wordids2string(np.argmax(pred[0], axis=1),
                             rwd=arwd,
                             maskid=maskid)
         # The prediction counts as correct when it starts with the reference.
         if hidecorrect and ga == pa[:len(ga)]:
             return False
         print("{}: {}".format(x, q))
         print(ga)
         print(pa)
         return True
     elif len(x) == 0:
         for i in range(0, qmat_x.shape[0]):
             # Forward the flag: without it the per-example call always ran
             # with hidecorrect=False, so nothing was ever hidden here.
             if play(i, hidecorrect=hidecorrect):
                 raw_input()
     else:
         raise Exception("invalid argument to play")
示例#2
0
def to_char_level(qmat, amat, qdic, adic, maskid):
    """Convert question/answer word matrices to a character-level encoding.

    Both matrices are passed through ``wordmat2charmat`` (maxlen 1000), then
    every strictly positive entry is shifted up by 2. Fresh dictionaries are
    built that map ``chr(x)`` to ``x + 2`` for every ``x`` below the largest
    shifted id, plus ``"<RARE>"`` mapped to 1. The first row of each matrix
    is printed as a sanity check.

    NOTE(review): presumably ``wordmat2charmat`` yields character codes, so
    the ``chr(x) -> x + 2`` mapping matches the shift -- confirm.

    Returns:
        (qmat, amat, qdic, adic): the converted matrices and the new
        character dictionaries.
    """
    qmat = wordmat2charmat(qmat, qdic, maxlen=1000, maskid=maskid)
    amat = wordmat2charmat(amat, adic, maxlen=1000, maskid=maskid)

    # Shift all positive ids by 2; entries <= 0 are left untouched.
    qmat[qmat > 0] += 2
    amat[amat > 0] += 2

    qdic = {chr(code): code + 2 for code in range(np.max(qmat))}
    adic = {chr(code): code + 2 for code in range(np.max(amat))}
    qdic["<RARE>"] = 1
    adic["<RARE>"] = 1

    q_id2char = {idx: char for char, idx in qdic.items()}
    print(wordids2string(qmat[0], q_id2char))
    a_id2char = {idx: char for char, idx in adic.items()}
    print(wordids2string(amat[0], a_id2char))
    return qmat, amat, qdic, adic
示例#3
0
 def searchwordmat(self, wordmat, wd, top=5):
     """Run ``searchsentence`` on every row of a word-id matrix.

     Each row of ``wordmat`` is turned into a string with the inverse of
     ``wd`` and passed to ``self.searchsentence`` with the given ``top``.

     Returns:
         A list with one entry per row: the list of ``"fb_id"`` values of
         the results returned for that row.
     """
     id2word = {idx: word for word, idx in wd.items()}
     timer = ticktock("wordmatsearcher")
     timer.tick("started searching")
     nrows = wordmat.shape[0]
     candidates = []
     for rowidx in range(nrows):
         text = wordids2string(wordmat[rowidx], rwd=id2word)
         hits = self.searchsentence(text, top=top)
         candidates.append([hit["fb_id"] for hit in hits])
         timer.progress(rowidx, nrows, live=True)
     timer.tock("done searching")
     return candidates
示例#4
0
 def xpp(i):
     """Print row ``i`` of ``qmat_x`` and of ``amat_x`` as strings."""
     question = wordids2string(qmat_x[i], rqdic, 0)
     print(question)
     answer = wordids2string(amat_x[i], radic, 0)
     print(answer)
示例#5
0
 def tpp(i):
     """Print row ``i`` of ``qmat_t`` and of ``amat_t`` as strings."""
     question = wordids2string(qmat_t[i], rqdic, 0)
     print(question)
     answer = wordids2string(amat_t[i], radic, 0)
     print(answer)
示例#6
0
 def pp(i):
     """Print row ``i`` of ``qmat_auto`` and of ``amat_auto`` as strings.

     The rows are decoded with the inverses of ``qdic_auto`` and ``adic``
     respectively, which are rebuilt on every call.
     """
     question_ids = qmat_auto[i]
     q_id2word = {idx: word for word, idx in qdic_auto.items()}
     print(wordids2string(question_ids, q_id2word, 0))
     answer_ids = amat_auto[i]
     a_id2word = {idx: word for word, idx in adic.items()}
     print(wordids2string(answer_ids, a_id2word, 0))
示例#7
0
 def pp(i):
     """Print row ``i`` of ``newtqmat`` and of ``newtamat`` as strings."""
     question = wordids2string(newtqmat[i], rqdic, 0)
     print(question)
     answer = wordids2string(newtamat[i], radic, 0)
     print(answer)
示例#8
0
 def pp(x):
     """Print a (question, answer) pair given as whitespace-separated ids.

     ``x[0]`` and ``x[1]`` are strings of integer ids; each is parsed and
     decoded with ``rqdic`` / ``radic`` respectively, using mask id 0.
     """
     question_ids = list(map(int, x[0].split()))
     print(wordids2string(question_ids, rqdic, maskid=0))
     answer_ids = list(map(int, x[1].split()))
     print(wordids2string(answer_ids, radic, maskid=0))
示例#9
0
 def pp(i):
     """Print row ``i`` of ``qmat`` and of ``amat`` as strings.

     The rows are decoded with the inverses of ``qdic`` and ``adic``, which
     are rebuilt on every call.
     """
     question_ids = qmat[i]
     q_id2word = {idx: word for word, idx in qdic.items()}
     print(wordids2string(question_ids, q_id2word))
     answer_ids = amat[i]
     a_id2word = {idx: word for word, idx in adic.items()}
     print(wordids2string(answer_ids, a_id2word))
示例#10
0
def _reverse_unmasked(row, maskid):
    """Return a new row with the non-mask entries of ``row`` reversed.

    The kept entries are packed at the front of the result; every remaining
    position holds ``maskid``.
    """
    out = maskid * np.ones_like(row)
    kept = row[row != maskid][::-1]
    out[:kept.shape[0]] = kept
    return out


def _replace_surface_form(row, surface_forms, type_id, rqdic, maskid):
    """Replace the first matching surface form in ``row`` by ``type_id``.

    ``surface_forms`` is a list of token lists, tried in order. ``row`` is
    modified in place: the matched span collapses to the single ``type_id``,
    the tail is shifted left and the freed positions are set to ``maskid``.
    Returns True when a replacement was made, False otherwise.
    """
    width = row.shape[0]
    for form in surface_forms:
        # A form may match with its last token missing, except when it
        # starts with "the" (then every token has to match).
        needed = len(form) - (1 if form[0] != "the" else 0)
        for start in range(width):
            if row[start] == maskid or rqdic[row[start]] != form[0]:
                continue
            matched = 0
            while matched < len(form) and start + matched < width:
                token = row[start + matched]
                # Stop at padding instead of looking the mask id up in
                # rqdic, which may not contain it.
                if token == maskid or rqdic[token] != form[matched]:
                    break
                matched += 1
            if matched >= needed:
                row[start] = type_id
                row[start + 1:width - matched + 1] = row[start + matched:]
                row[width - matched + 1:] = maskid
                return True
    return False


def preprocess(qmat,
               amat,
               qdic,
               adic,
               qwc,
               awc,
               maskid,
               qreversed=False,
               dorare=True):
    """Replace entity tokens by type placeholders and mark rare words.

    Steps, in order:
      1. Every ``"capital:c"`` id in ``amat`` becomes the ``"capital:t"`` id.
      2. For each answer word ending in ``:c :s :r :m :n :lo :co`` (except
         ``"capital:c"``) a ``"<suffix>-type"`` word is added to ``adic``
         and ``qdic`` if missing.
      3. Each such id in ``amat`` is replaced by its ``-type`` id, and the
         entity's surface form (the part before ``:``, split on ``_``) is
         searched in the same row of ``qmat`` and collapsed into the
         ``-type`` id. ``"usa"`` has a few hard-coded alternative forms.
      4. The number of rows with a ``-type`` word in the answer but none in
         the question is printed as ``"<count>/<rows>"``.
      5. If ``dorare``, every question word whose count in ``qwc`` is below
         2 is replaced by the ``"<RARE>"`` id.

    Args:
        qmat: 2-D question id matrix; rows are edited in place.
        amat: 2-D answer id matrix, edited in place. Indexed with the same
            row indices as ``qmat``.
        qdic: question word -> id dictionary; gains ``-type`` entries.
        adic: answer word -> id dictionary; gains ``-type`` entries.
        qwc: question word -> count dictionary.
        awc: returned unchanged.
        maskid: id used for padding.
        qreversed: when true, each question row is un-reversed (non-mask
            entries reversed and packed to the front) before matching and
            reversed again afterwards.
        dorare: whether to apply step 5.

    Returns:
        (qmat, amat, qdic, adic, qwc, awc). With ``dorare`` the returned
        ``qmat`` is a new array produced by ``np.vectorize``.
    """
    # TODO: add positional replacement and change other functions accordingly
    amat[amat == adic["capital:c"]] = adic["capital:t"]

    short_suffixes = ":c :s :r :m :n".split()
    long_suffixes = ":lo :co".split()
    replaceina = set()
    for word in adic:
        if word == "capital:c":
            continue
        if word[-2:] in short_suffixes or word[-3:] in long_suffixes:
            replaceina.add(word)

    for word in replaceina:
        typeword = word.split(":")[1] + "-type"
        if typeword not in adic:
            adic[typeword] = max(adic.values()) + 1
        if typeword not in qdic:
            qdic[typeword] = max(qdic.values()) + 1

    radic = {v: k for k, v in adic.items()}
    rqdic = {v: k for k, v in qdic.items()}
    # Loop-invariant: neither adic nor replaceina changes below.
    replace_ids = {adic[word] for word in replaceina}

    for i in range(qmat.shape[0]):
        for j in range(amat.shape[1]):
            if amat[i, j] not in replace_ids:
                continue
            parts = radic[amat[i, j]].split(":")
            sf = parts[0].split("_")
            typeword = parts[1] + "-type"
            amat[i, j] = adic[typeword]

            sfs = [sf]
            if sf == ["usa"]:
                sfs.append("united states".split())
                sfs.append("the country".split())
                sfs.append("the states".split())
                sfs.append(["us"])
                sfs.append(["america"])

            qmati = qmat[i]
            if qreversed:
                qmati = _reverse_unmasked(qmati, maskid)
            _replace_surface_form(qmati, sfs, qdic[typeword], rqdic, maskid)
            if qreversed:
                qmati = _reverse_unmasked(qmati, maskid)
            qmat[i] = qmati

    # Diagnostic: rows whose answer has a placeholder but whose question
    # does not, i.e. the surface form was not found in the question.
    unmatched = 0
    for i in range(qmat.shape[0]):
        if "-type" in wordids2string(amat[i], radic) and \
                "-type" not in wordids2string(qmat[i], rqdic):
            unmatched += 1
    print("{}/{}".format(unmatched, qmat.shape[0]))

    # rare words
    if dorare:
        rareids = {qdic[word] for word, count in qwc.items() if count < 2}
        qmat = np.vectorize(lambda x: qdic["<RARE>"]
                            if x in rareids else x)(qmat)

    return qmat, amat, qdic, adic, qwc, awc