Exemplo n.º 1
0
def info_dataset(dev=NA):
    """Print similarity statistics for the positive and negative pair sets.

    Loads ``(n, pos, neg)`` from ``so.pair`` and a similarity tensor from
    ``so.sim``.  For each of the two pair sets, every row ``i`` of the
    similarity tensor is reduced to its min / mean / max over the entries
    selected for that row, and the per-row values are printed.  A five-value
    summary per pair set (smallest min, mean of mins, mean of means, mean of
    maxes, largest max) is printed and finally all summaries are printed
    together.

    Args:
        dev: Device to compute on; ``nav`` substitutes ``DEV`` when it is
            left at the ``NA`` default.
    """
    dev = nav(dev, DEV)
    n, pos, neg = Path('so.pair').file().load()
    sim = Path('so.sim').file().load().to(dev)

    summaries = []
    for pairs in (pos, neg):
        # presumably scatters the pairs into an (n, n) boolean selector
        # -- TODO confirm against TensData
        selector = TensData(pairs, 1, s=TRUE)(tens0((n, n), dt=DTB, dev=dev))

        per_row = []
        for row in range(n):
            picked = sim[row][selector[row]]
            lowest = picked.min().tolist()
            average = picked.mean().tolist()
            highest = picked.max().tolist()
            per_row.append([lowest, average, highest])
            pr(row, lowest, average, highest)

        # Columns: 0 = per-row min, 1 = per-row mean, 2 = per-row max.
        table = tens(per_row, dt=DTR, dev=dev)
        summary = [
            table[:, 0].min().tolist(),
            table[:, 0].mean().tolist(),
            table[:, 1].mean().tolist(),
            table[:, 2].mean().tolist(),
            table[:, 2].max().tolist(),
        ]
        pl(*summary)
        summaries.append(summary)
    pr.params({'exts': '10'})(summaries)
Exemplo n.º 2
0
def draw_qxl(x, y):
    """Draw the vector ``(x, y)`` and the rows of ``E - v^T v`` as arrows.

    The vector is drawn in blue from the origin.  Each row of the matrix
    ``tense(2) - v^T v`` (``v`` being the 1x2 row vector) is then drawn in
    red, starting at the tip of the blue arrow.

    Args:
        x: First component of the vector.
        y: Second component of the vector.
    """
    before_draw()
    vec = tens([x, y], dt=DTR).view(1, -1)
    outer = vec.t().matmul(vec)
    # tense(...) is presumably an identity matrix -- TODO confirm
    offsets = (tense(vec.numel(), dt=DTR) - outer).tolist()
    tip = vec.squeeze().tolist()

    # Arrow-head settings shared by every arrow in the figure.
    head = dict(length_includes_head=TRUE, head_width=0.1, head_length=0.1)

    plt.arrow(0, 0, *tip, ec='b', fc='b', **head)
    for offset in offsets:
        plt.arrow(*tip, *offset, ec='r', fc='r', **head)

    half_width = 5
    plt.xlim(-half_width, half_width)
    plt.ylim(-half_width / 2, half_width / 2)
    after_draw()
    plt.show()
Exemplo n.º 3
0
def proc_km(k, v):
    """Summarise repeated measurements as ``{name: (mean, std)}`` in percent.

    Args:
        k: Names, one per column of ``v``.
        v: Nested sequence of measurements; statistics are taken along
            dimension 0.

    Returns:
        Dict mapping each name to a ``(mean, std)`` tuple, both multiplied
        by 100 and rounded to 4 decimals.
    """
    samples = tens(v, dt=DTR, dev=dev)

    def as_percent(stat):
        # Scale to percent and round every entry to 4 decimals.
        return [round(value, 4) for value in stat.mul(100).tolist()]

    means = as_percent(samples.mean(dim=0))
    spreads = as_percent(samples.std(dim=0))
    return {
        name: (mean, spread)
        for name, mean, spread in zip(k, means, spreads)
    }
Exemplo n.º 4
0
def draw_sj(k=NA, t=NA, n=NA):
    """Plot the curves ``a**x`` where ``a = k**(1/t)``, one per value in ``k``.

    Each curve is labelled ``"<k>:<a>"`` with ``k`` to one decimal and ``a``
    to four decimals.

    Args:
        k: Values to draw a curve for; defaults to 0.1, 0.3, 0.5, 0.7, 0.9.
        t: Exponent divisor used to derive ``a`` from ``k``; defaults to 100.
        n: Size argument passed to ``tensa`` for the x axis; defaults to 1000.
    """
    k = nav(k, [0.1, 0.3, 0.5, 0.7, 0.9])
    t = nav(t, 100)
    n = nav(n, 1000)
    before_draw()

    curve_count = len(k)
    targets = tens(k, dt=DTR)
    rates = targets**(1 / t)
    xs = tensa(n, dt=DTR)
    # Broadcast a column of rates against the x values: one row per curve.
    ys = rates.view(-1, 1)**xs

    for idx in range(curve_count):
        caption = f'{targets[idx].tolist():.1f}:{rates[idx].tolist():.4f}'
        plt.plot(xs.numpy(), ys[idx].numpy(), label=caption)

    plt.legend()
    after_draw()
    plt.show()
Exemplo n.º 5
0
def draw_bkm(fn):
    """Plot the metric curves stored in result file ``fn`` to ``<fn>.jpg``.

    Every non-blank line of the file is expected to look like
    ``<dn>-<sn>-<pn>-<iter>: <dict literal>``.  Values are grouped by
    ``dn`` and metric name; each (``dn``, metric) pair gets its own subplot
    with one line per ``<sn>-<pn>`` series, in file order.  When a metric
    value is a tuple only its first element is plotted.

    Args:
        fn: Path of the result file; the figure is saved next to it with a
            ``.jpg`` suffix appended.

    Raises:
        ValueError: If a non-blank line does not match the expected
            ``key: value`` / four-part key layout.
    """
    from matplotlib import pyplot as plt
    plt.rcParams['figure.figsize'] = (40, 15)
    plt.rcParams['figure.dpi'] = 100
    plt.rcParams['savefig.dpi'] = 100

    curves = {}
    for line in get_lines(File(fn)):
        # Blank separator lines carry no record; unpacking them would fail.
        if not line.strip():
            continue
        key, payload = line.split(':', 1)
        dn, sn, pn, _ = key.split('-')
        series = f'{sn}-{pn}'
        # NOTE(review): eval() executes whatever the file contains -- only
        # use this on result files produced by trusted runs.
        metrics = eval(payload)
        by_metric = curves.setdefault(dn, {})
        for metric in metrics:
            value = metrics[metric]
            if isTuple(value):
                value = value[0]
            by_metric.setdefault(metric, {}).setdefault(series, []).append(value)

    # Keep the 2x4 grid for up to 8 panels; add rows instead of failing with
    # an out-of-range subplot index when there are more.
    cols = 4
    panel_count = sum(len(by_metric) for by_metric in curves.values())
    rows = max(2, -(-panel_count // cols))

    panel = 0
    for dn, by_metric in curves.items():
        for metric, by_series in by_metric.items():
            panel += 1
            plt.subplot(rows, cols, panel)
            for series, values in by_series.items():
                # x axis is the 1-based position of each value in the series.
                x = (tensa(len(values), dt=DTI) + 1).numpy()
                y = tens(values, dt=DTR).numpy()
                plt.plot(x, y, label=series)
            plt.title(f'{dn}-{metric}')
            plt.legend()
            plt.grid()

    margin = 0.05
    gap = 0.2
    plt.subplots_adjust(left=margin,
                        right=1 - margin,
                        bottom=margin,
                        top=1 - margin,
                        wspace=gap,
                        hspace=gap)
    plt.savefig(f'{fn}.jpg')
    plt.clf()
    plt.cla()
Exemplo n.º 6
0
def get_wvs():
    """Build and store a word-vector matrix for every dataset in ``dns``.

    For each dataset the vocabulary (``TFIDF(xpp).wid``, word -> id) is
    looked up in the GoogleNews word2vec binary file.  Row ``i`` of the
    stored matrix is the vector of the word with id ``i``, or zeros when the
    word is not in the word2vec file.  The matrix is stored as ``<dn>-wvs``.

    Raises:
        EOFError: If the word2vec file ends before all announced entries
            have been read.
    """
    import numpy as np

    def load_word_vec_skipgram(wid):
        """Yield ``(vocab_size, vec_size)`` first, then ``(id, vector)`` pairs.

        Only words present in ``wid`` are yielded; all other entries are
        read and discarded.
        """
        file = File('data/wv/GoogleNews-vectors-negative300.bin')
        dt = np.dtype('float32')
        with file.open('rb') as f:
            header = f.readline().decode()
            # Original note gives these as 300_0000 and 300 for this file.
            vocab_size, vec_size = map(int, header.split())
            yield vocab_size, vec_size
            vec_len = vec_size * dt.itemsize  # 1200 bytes for 300 float32
            for _ in range(vocab_size):
                # Each entry is "<word><space><raw float32 bytes>", optionally
                # preceded by a newline left over from the previous entry.
                chars = []
                while TRUE:
                    b = f.read(1)
                    if not b:
                        # read() returns b'' at EOF; without this check the
                        # loop would never terminate on a truncated file.
                        raise EOFError(
                            'word2vec file ended while reading a word')
                    if b == b' ':
                        break
                    if b != b'\n':
                        chars.append(b)
                word = b''.join(chars).decode()
                vec = f.read(vec_len)
                if len(vec) != vec_len:
                    raise EOFError(
                        f'word2vec file ended inside the vector of {word!r}')
                if word in wid:
                    yield wid[word], np.frombuffer(vec, dtype=dt)

    for dn in dns:
        print(f'dn: {dn}')
        xpp = File(f'data/{dn}-xpp').load()
        wid = TFIDF(xpp).wid
        print(f'size_vocab: {len(wid)}')
        load_word_vec = load_word_vec_skipgram(wid)
        # The first item is the header; everything after it is (id, vector).
        _, vec_size = next(load_word_vec)
        wvs = {
            w: tens(v, dt=DTR, dev=dev).view(1, -1)
            for w, v in load_word_vec
        }
        print(f'size_vocab_wv: {len(wvs)}')
        # Words without a pretrained vector get an all-zero row.
        wvs = [(wvs[i] if i in wvs else tens0((1, vec_size), dt=DTR, dev=dev))
               for i in range(len(wid))]
        wvs = tc.cat(wvs, dim=0)
        # NOTE(review): stored without the 'data/' prefix used for the
        # inputs above -- confirm this is the intended location.
        File(f'{dn}-wvs').store(wvs)
Exemplo n.º 7
0
def run_bkm_proposed(fn):
    """Train per dataset / similarity source and log clustering results.

    For every dataset name in ``dns`` (except ``'so'``) and every similarity
    source in ``sns`` (minus a few skipped combinations) a fresh model is
    trained for ``iters`` rounds.  After each round the model is evaluated,
    clustered via ``rb.kmeans``, and one line
    ``<dn>-<sn>-<pn>-<round>: <stats dict>`` is appended to ``res/<fn>`` and
    printed with a timestamp.

    Args:
        fn: File name (under ``res/``) the result lines are appended to.
    """
    def do_train_cos(self):
        """Return the weighted MSE between batch similarities and targets.

        The model output is passed through ``tens_unit(..., dim=1)`` and
        multiplied with its own transpose; the target is the sub-matrix of
        ``self.sup()`` selected by the batch ids (``self.batch[0]``).
        """
        ids = self.batch[0]
        r = self.model(*self.batch)
        r = tens_unit(r, dim=1)
        r = r.matmul(r.t())

        s = self.sup()
        # Index on the device where the target matrix lives, then move only
        # the selected sub-matrix to the training device.
        ids = ids.to(s.device)
        s = s[ids][:, ids]
        s = s.to(self.dev)

        loss = ((r - s)**2).mean()
        loss = loss * self.weight()
        return loss

    do_train = do_train_cos
    get_kmeans = KMeansCos

    # Fixed seed for the whole run.
    init_rand(88888888)
    rb = RunBert()
    rb.init_path()
    rb.init_dev(dev)
    file = File(f'res/{fn}')
    for dn in dns:
        # The 'so' dataset is excluded from this experiment.
        if dn == 'so': continue

        y = File(f'data/{dn}-y').load()
        x = File(f'data/{dn}-x').load()
        rb.init_data((x, y))
        for sn in sns:
            # Two dataset / similarity-source combinations are skipped.
            if sn in ['tfidf', 'skipgram']:
                if dn == 'gs' and sn == 'tfidf': continue
                if dn == 'bm' and sn == 'skipgram': continue

            sim = File(f'data/{dn}-sim-{sn}').load()
            sim_size = sim.shape[0]
            rb.sim = sim
            # Drop the local name; rb.sim still refers to the loaded object.
            del sim
            for pn in range(3):
                # Only pn == 2 is actually run.
                if pn in [0, 1]: continue

                # NOTE(review): because of the `continue` above this branch
                # can never execute; ids is always NA at the moment.
                if pn in [0, 1]:
                    eig = File(f'data/{dn}-sim-{sn}-eig').load()
                    eig = tens(eig, dt=DTR, dev=dev)
                    eigk = (1, 0.1)[pn]
                    ids = tensa(len(y), dt=DTI, dev=dev)
                    ids = ids[eig >= eigk].tolist()
                    print(f'len(ids)={len(ids)}, eigk={eigk}')
                    del eig, eigk
                else:
                    ids = NA
                # NOTE(review): model_name is assigned but not used anywhere
                # in this function.
                model_name = f'model-{dn}-{sn}-{pn}'
                rb.init_model()

                iters = 100
                iter_count = 150

                def get_sampler(dataset):
                    """Build the pair sampler for ``dataset``.

                    Reads ``sim_size``, ``ids``, ``iters`` and ``iter_count``
                    from the enclosing scope.
                    """
                    sp_u = SamplerPair(dataset, 0, sim_size, ids=ids, nks=10)
                    sp_u = SamplerPairn(dataset, sp_u, iters, iter_count)
                    return sp_u

                rb.init_sampler_eval()
                rb.init_sampler(get_sampler)
                rb.init_optimizer()
                for i in range(iters):
                    rb.train(do_train)
                    # Read the loss before clear_train() is called.
                    epoch_loss = rb.epoch_loss
                    rb.clear_train()

                    rb.init_repr()
                    rb.eval()
                    rb.clear_eval()

                    km = rb.kmeans(get_kmeans)
                    rb.clear_repr()

                    k, v = run_km(km)
                    del km
                    s = proc_km(k, v)
                    s['loss'] = round(epoch_loss, 4)

                    # One result line per round, with a 1-based round number.
                    t = f'{dn}-{sn}-{pn}-{i+1}: {s}'
                    file.writea(t + '\n')
                    print(f'{TimeDate().valstr()}: {t}')

                # Release everything created for this configuration.
                rb.clear_optimizer()
                rb.clear_sampler()
                rb.clear_sampler_eval()
                rb.clear_model()
                # Blank line separates configurations in the result file.
                file.writea('\n')
                print('=' * 64)
            rb.sim = NA
        rb.clear_data()
Exemplo n.º 8
0
        epoch += 1
        if epoch % 100 == 0: pr()
        if epoch > epochs: break
        if tl < 0.01: nconv += 1
        else: nconv = 0
        if nconv >= 10: break
    pl()
    model = model.cpu()
    with tc.no_grad():
        for k, v in model.named_parameters():
            pr(k, v.squeeze().tolist())
        pl()
        while TRUE:
            s = pris()
            try:
                tx = tens([[real(s)]], dt=DTR)
                ty = func(tx)
                r = model(tx)
                l = loss(r, ty).tolist()
                r = r.squeeze().tolist()
                ty = ty.squeeze().tolist()
                pr(y=r, Y=ty, loss=l)
            except Exception as e:
                pr(e)
                s = pris()
                lr, epochs = [real(i) for i in s.split()]
                break

# Stop the script at this point; statements after this call are not run.
exit()