def run_icp(idx_x, idx_y, i):
    """Run one ICP initialisation pass from a given starting index pairing.

    A fresh ``ICPTrainer`` is built on copies of the module-level ``src_W``
    and ``tgt_W`` and trained for ``params.icp_init_epochs`` epochs, starting
    from ``idx_x`` / ``idx_y``.  A one-line summary (run label, ``rec``,
    ``bb`` and wall-clock seconds) is printed.

    Args:
        idx_x: Forwarded to ``train_icp`` as ``indices_x``.
        idx_y: Forwarded to ``train_icp`` as ``indices_y``.
        i: Integer label of this run; only used in the printed summary.

    Returns:
        The tuple ``(indices_x, indices_y, rec, bb)`` produced by
        ``train_icp``.
    """
    trainer = ICPTrainer(src_W.copy(), tgt_W.copy(), True, params.n_pca)
    started = time.time()
    out_x, out_y, rec, bb = trainer.train_icp(
        params.icp_init_epochs,
        indices_x=idx_x,
        indices_y=idx_y,
    )
    elapsed = time.time() - started
    print("%d: Rec %f BB %d Time: %f" % (i, rec, bb, elapsed))
    return out_x, out_y, rec, bb
def run_icp(s0, i):
    """Run one seeded ICP initialisation pass.

    NumPy's global RNG is seeded with ``s0 + i`` so that every run index gets
    its own reproducible random stream.  A fresh ``ICPTrainer`` is then built
    on copies of the module-level ``src_W`` and ``tgt_W`` and trained for
    ``params.icp_init_epochs`` epochs.  A one-line summary is printed.

    Args:
        s0: Base seed shared by all runs.
        i: Run index; added to ``s0`` for seeding and used as the print label.

    Returns:
        The tuple ``(indices_x, indices_y, rec, bb)`` produced by
        ``train_icp``.
    """
    np.random.seed(s0 + i)
    trainer = ICPTrainer(src_W.copy(), tgt_W.copy(), True, params.n_pca)
    started = time.time()
    out_x, out_y, rec, bb = trainer.train_icp(params.icp_init_epochs)
    elapsed = time.time() - started
    print("%d: Rec %f BB %d Time: %f" % (i, rec, bb, elapsed))
    return out_x, out_y, rec, bb
def run_icp_with_sample(i, sample_size=10000):
    """Run one ICP initialisation pass on a random column sample.

    The same randomly drawn column indices are taken from the module-level
    ``src_W`` and ``tgt_W``, an ``ICPTrainer`` is built on the two sampled
    matrices, and it is trained for ``params.icp_init_epochs`` epochs starting
    from the identity pairing ``0..sample_size-1`` on both sides.

    Args:
        i: Integer label of this run; only used in the printed summary.
        sample_size: Number of columns to draw (with replacement).  Defaults
            to 10000, the value that used to be hard-coded.

    Returns:
        ``(indices_x, indices_y, rec, bb, sample_src_W, sample_tgt_W)`` where
        the first four items come from ``train_icp`` and the last two are the
        sampled matrices the trainer was built on.
    """
    # NOTE(review): the lower bound of 1 means column 0 is never sampled.
    # Left unchanged because it may be intentional -- confirm, and use 0 if
    # the first column should be eligible.
    sample = np.random.randint(1, src_W.shape[1], size=sample_size)

    # np.take returns a new array, so copying the full matrices first is
    # unnecessary.
    sample_src_W = np.take(a=src_W, indices=sample, axis=1)
    sample_tgt_W = np.take(a=tgt_W, indices=sample, axis=1)

    icp = ICPTrainer(sample_src_W, sample_tgt_W, True, params.n_pca)
    t0 = time.time()
    # Two separate arange arrays on purpose: the trainer receives independent
    # index buffers for each side, as before.
    indices_x, indices_y, rec, bb = icp.train_icp(
        params.icp_init_epochs,
        indices_x=np.arange(len(sample)),
        indices_y=np.arange(len(sample)),
    )
    dt = time.time() - t0
    print("%d: Rec %f BB %d Time: %f" % (i, rec, bb, dt))
    return indices_x, indices_y, rec, bb, sample_src_W, sample_tgt_W
示例#4
0
def sub_icp(src_W, tgt_W, n_icp_runs):
    """Align ``src_W`` and ``tgt_W`` with ICP and return the two transforms.

    The procedure has three stages:

    1. ``n_icp_runs`` independent, seeded initialisation runs (serially when
       ``params.n_processes == 1``, otherwise in a process pool); the run with
       the lowest ``rec`` supplies the starting index pairing.
    2. A training pass of ``params.icp_train_epochs`` epochs started from that
       pairing.
    3. A fine-tuning pass of ``params.icp_ft_epochs`` epochs with
       ``do_reciprocal=True``, initialised with the transforms from stage 2.

    Args:
        src_W: Source matrix handed to ``ICPTrainer``.
        tgt_W: Target matrix handed to ``ICPTrainer``.
        n_icp_runs: Number of initialisation runs in stage 1.

    Returns:
        ``(TX, TY)``: the transforms held by the fine-tuned trainer.
    """
    s0 = np.random.randint(50000)
    if params.n_processes == 1:
        results = [_sub_icp_init_run(src_W, tgt_W, s0, i)
                   for i in range(n_icp_runs)]
    else:
        pool = multiprocessing.Pool(processes=params.n_processes)
        try:
            # Every task carries its complete argument tuple.  The previous
            # code mapped a nested two-argument function over bare run
            # indices, which fails both on pickling (local function) and on
            # the missing seed argument.
            pending = [
                pool.apply_async(_sub_icp_init_run, (src_W, tgt_W, s0, i))
                for i in range(n_icp_runs)
            ]
            results = [job.get()
                       for job in tqdm.tqdm(pending, total=n_icp_runs)]
        finally:
            pool.close()
            pool.join()

    data = np.zeros((n_icp_runs, 2))  # one (rec, bb) row per run
    best_idx_x = None
    best_idx_y = None
    min_rec = 1e8
    for i, (indices_x, indices_y, rec, bb) in enumerate(results):
        data[i, 0] = rec
        data[i, 1] = bb
        if rec < min_rec:
            best_idx_x = indices_x
            best_idx_y = indices_y
            min_rec = rec

    idx = np.argmin(data[:, 0], 0)
    print("Init - Achieved: Rec %f BB %d" % (data[idx, 0], data[idx, 1]))

    icp_train = ICPTrainer(src_W, tgt_W, False, src_W.shape[0])
    _, _, rec, bb = icp_train.train_icp(params.icp_train_epochs, True,
                                        best_idx_x, best_idx_y)
    print("Training - Achieved: Rec %f BB %d" % (rec, bb))

    # Fine-tune a fresh trainer that starts from the trained transforms.
    icp_ft = ICPTrainer(src_W, tgt_W, False, src_W.shape[0])
    icp_ft.icp.TX = icp_train.icp.TX
    icp_ft.icp.TY = icp_train.icp.TY
    _, _, rec, bb = icp_ft.train_icp(params.icp_ft_epochs, do_reciprocal=True)
    print("Reciprocal Pairs - Achieved: Rec %f BB %d" % (rec, bb))

    return icp_ft.icp.TX, icp_ft.icp.TY


def _sub_icp_init_run(src_W, tgt_W, s0, i):
    """Run one seeded ICP initialisation attempt for ``sub_icp``.

    Defined at module level (not nested inside ``sub_icp``) so that
    ``multiprocessing`` can pickle it and dispatch it to worker processes.

    Args:
        src_W: Source matrix; a copy is handed to the trainer.
        tgt_W: Target matrix; a copy is handed to the trainer.
        s0: Base seed shared by all runs.
        i: Run index; added to ``s0`` for seeding and used as the print label.

    Returns:
        The tuple ``(indices_x, indices_y, rec, bb)`` produced by
        ``train_icp``.
    """
    np.random.seed(s0 + i)
    icp = ICPTrainer(src_W.copy(), tgt_W.copy(), True, params.n_pca)
    t0 = time.time()
    indices_x, indices_y, rec, bb = icp.train_icp(params.icp_init_epochs)
    dt = time.time() - t0
    print("%d: Rec %f BB %d Time: %f" % (i, rec, bb, dt))
    return indices_x, indices_y, rec, bb
def train_model(src_lang, tgt_lang, src_W, tgt_W, n_runs, n_pca, n_processes,
                init_epochs, train_epochs, ft_epochs, n_ft, min_rec=1e8, min_bb=None):
    """Train an ICP alignment between two languages and return its transforms.

    Stages:

    1. ``n_runs`` initialisation runs via ``_run_icp`` (serially when
       ``n_processes == 1``, otherwise in a process pool).  The run with the
       lowest ``rec`` below ``min_rec`` supplies the starting index pairing.
    2. A training pass of ``train_epochs`` epochs on ``src_W`` / ``tgt_W``
       started from that pairing.
    3. A fine-tuning pass of ``ft_epochs`` epochs with ``do_reciprocal=True``
       on the matrices loaded from ``data/<lang>_<n_ft>.npy`` (transposed),
       initialised with the transforms from stage 2.

    Args:
        src_lang: Source language code; used to build the fine-tuning path.
        tgt_lang: Target language code; used to build the fine-tuning path.
        src_W: Source matrix for initialisation and training.
        tgt_W: Target matrix for initialisation and training.
        n_runs: Number of initialisation runs.
        n_pca: Forwarded to ``_run_icp``.
        n_processes: Worker process count; ``1`` runs everything in-process.
        init_epochs: Forwarded to ``_run_icp``.
        train_epochs: Epoch count for the training pass.
        ft_epochs: Epoch count for the fine-tuning pass.
        n_ft: Integer that selects which ``data/<lang>_<n_ft>.npy`` files are
            loaded for fine-tuning.
        min_rec: Initial threshold; only runs with ``rec`` strictly below it
            can become the chosen initialisation.
        min_bb: Unused; retained so existing callers keep working.

    Returns:
        ``(TX, TY)``: the transforms held by the fine-tuned trainer.
    """
    s0 = np.random.randint(50000)
    if n_processes == 1:
        results = [_run_icp(src_W, tgt_W, s0, i, n_pca, init_epochs)
                   for i in range(n_runs)]
    else:
        pool = multiprocessing.Pool(processes=n_processes)
        try:
            # Submit the same six arguments the serial branch uses; mapping
            # _run_icp over bare run indices called it with a single argument.
            pending = [
                pool.apply_async(
                    _run_icp, (src_W, tgt_W, s0, i, n_pca, init_epochs))
                for i in range(n_runs)
            ]
            results = [job.get() for job in tqdm.tqdm(pending, total=n_runs)]
        finally:
            pool.close()
            pool.join()

    data = np.zeros((n_runs, 2))  # one (rec, bb) row per run
    best_idx_x = None
    best_idx_y = None
    for i, (indices_x, indices_y, rec, bb) in enumerate(results):
        data[i, 0] = rec
        data[i, 1] = bb
        if rec < min_rec:
            best_idx_x = indices_x
            best_idx_y = indices_y
            min_rec = rec

    idx = np.argmin(data[:, 0], 0)
    print("Init - Achieved: Rec %f BB %d" % (data[idx, 0], data[idx, 1]))

    icp_train = ICPTrainer(src_W, tgt_W, False, src_W.shape[0])
    _, _, rec, bb = icp_train.train_icp(train_epochs, True, best_idx_x, best_idx_y)
    print("Training - Achieved: Rec %f BB %d" % (rec, bb))

    # Fine-tuning runs on separately stored matrices, not on the arguments.
    src_W = np.load("data/%s_%d.npy" % (src_lang, n_ft)).T
    tgt_W = np.load("data/%s_%d.npy" % (tgt_lang, n_ft)).T
    icp_ft = ICPTrainer(src_W, tgt_W, False, src_W.shape[0])
    icp_ft.icp.TX = icp_train.icp.TX
    icp_ft.icp.TY = icp_train.icp.TY
    _, _, rec, bb = icp_ft.train_icp(ft_epochs, do_reciprocal=True)
    print("Reciprocal Pairs - Achieved: Rec %f BB %d" % (rec, bb))

    return icp_ft.icp.TX, icp_ft.icp.TY
            min_bb = bb
            best_sample_src_W = sample_src_W
            best_sample_tgt_W = sample_tgt_W
    print(best_sample_src_W)
    return best_idx_x, best_idx_y, best_sample_src_W, best_sample_tgt_W



# Alternative ways of obtaining the starting index pairing, currently
# disabled:
#best_idx_x, best_idx_y, best_sample_src_W, best_sample_tgt_W = initialize_with_medical_dict_with_sample()

#best_idx_x, best_idx_y = initialize_with_medical_dict()

# Active choice: start from the identity pairing, i.e. column k of src_W is
# paired with column k of tgt_W for every column of src_W.
best_idx_x, best_idx_y = (np.arange(src_W.shape[1]), np.arange(src_W.shape[1]))
# NOTE(review): `data` is not assigned anywhere in this part of the script;
# presumably it is the per-run (rec, bb) table filled by an earlier
# initialisation step -- confirm it exists when the identity pairing is used.
idx = np.argmin(data[:, 0], 0)
print("Init - Achieved: Rec %f BB %d" % (data[idx, 0], data[idx, 1]))
# Training pass, started from the pairing chosen above.
icp_train = ICPTrainer(src_W, tgt_W, False, src_W.shape[0])
_, _, rec, bb = icp_train.train_icp(params.icp_train_epochs, True, indices_x=best_idx_x, indices_y=best_idx_y)
print("Training - Achieved: Rec %f BB %d" % (rec, bb))
# From here on src_W / tgt_W are rebound to the transposed contents of the
# per-language "training" files.
src_W = np.load('data/%s_training.npy' % (params.src_lang)).T
tgt_W = np.load('data/%s_training.npy' % (params.tgt_lang)).T
# Fine-tuning pass: a fresh trainer that starts from the trained transforms
# and runs with do_reciprocal=True.
icp_ft = ICPTrainer(src_W, tgt_W, False, src_W.shape[0])
icp_ft.icp.TX = icp_train.icp.TX
icp_ft.icp.TY = icp_train.icp.TY
indices_x, indices_y, rec, bb = icp_ft.train_icp(params.icp_ft_epochs, do_reciprocal=True)
print("Reciprocal Pairs - Achieved: Rec %f BB %d" % (rec, bb))
# Final transforms, kept as module-level names.
TX = icp_ft.icp.TX
TY = icp_ft.icp.TY

# Ensure the checkpoint directory exists.
if not os.path.exists(params.cp_dir):
    os.mkdir(params.cp_dir)
def match_by_reciprocal_pairs():
    """Train one ICP transform pair per matched k-means cluster and save them.

    Steps:

    1. Load the source/target "training" matrices and cluster each of them
       into ``ncentroids`` groups with faiss k-means.
    2. Load the nearest-neighbour index arrays ``indices_x.npy`` and
       ``indices_y.npy`` and, for every reciprocal pair (``i -> j`` and
       ``j -> i``), count which source cluster meets which target cluster.
    3. Map every source cluster to the target cluster it meets most often and
       assert that the mapping is its own inverse.
    4. For every matched cluster pair, run an ICP training pass followed by a
       reciprocal fine-tuning pass, starting from the transforms stored in
       ``output/en_es_T-medical.npy`` / ``output/es_en_T-medical.npy``.
    5. Save the per-cluster transforms and both k-means indexes under
       ``params.cp_dir``.

    Returns:
        None.  Results are written to disk.

    Raises:
        AssertionError: If the loaded arrays differ in length or the cluster
            mapping is not reciprocal.
    """
    x = np.load('data/%s_training.npy' % (params.src_lang)).astype('float32')
    y = np.load('data/%s_training.npy' % (params.tgt_lang)).astype('float32')

    d = x.shape[1]
    ncentroids = 4
    niter = 20
    verbose = True

    # Cluster each side and flatten the (n, 1) assignment arrays right away so
    # every later lookup yields a scalar cluster id.
    kmeans_x = faiss.Kmeans(d, ncentroids, niter=niter, verbose=verbose)
    kmeans_x.train(x)
    _, clusters_x = kmeans_x.index.search(x, 1)
    clusters_x = clusters_x.flatten()
    print(CountFrequency(clusters_x))

    kmeans_y = faiss.Kmeans(d, ncentroids, niter=niter, verbose=verbose)
    kmeans_y.train(y)
    _, clusters_y = kmeans_y.index.search(y, 1)
    clusters_y = clusters_y.flatten()
    print(CountFrequency(clusters_y))

    indices_x = np.load('indices_x.npy')
    indices_y = np.load('indices_y.npy')

    assert len(indices_x) == len(indices_y) == len(x) == len(y)

    # Source side: group rows by cluster and tally reciprocal pairs per
    # (source cluster, target cluster).
    x_y_clusters_match = np.zeros((ncentroids, ncentroids))
    clusters_x_emb = [[] for _ in range(ncentroids)]
    for i, y_idx in enumerate(indices_x):
        cluster = clusters_x[i]
        clusters_x_emb[cluster].append(x[i])
        if indices_y[y_idx] == i:
            x_y_clusters_match[cluster][clusters_y[y_idx]] += 1

    print(np.array(clusters_x_emb[0]).shape)

    # Target side: the mirror image of the loop above.
    y_x_clusters_match = np.zeros((ncentroids, ncentroids)).astype('int32')
    clusters_y_emb = [[] for _ in range(ncentroids)]
    for i, x_idx in enumerate(indices_y):
        cluster = clusters_y[i]
        clusters_y_emb[cluster].append(y[i])
        if indices_x[x_idx] == i:
            y_x_clusters_match[cluster][clusters_x[x_idx]] += 1

    print(np.array(clusters_y_emb[0]).shape)

    print(x_y_clusters_match)
    print(y_x_clusters_match)

    x_cluster_maps = x_y_clusters_match.argmax(axis=1)
    y_cluster_maps = y_x_clusters_match.argmax(axis=1)

    # The cluster mapping must be reciprocal: x -> y -> x is the identity.
    assert ((x_cluster_maps[y_cluster_maps] == np.arange(ncentroids)).all())

    # NOTE(review): these paths are fixed to en/es regardless of
    # params.src_lang / params.tgt_lang -- confirm that is intended.
    TX = np.load('output/en_es_T-medical.npy')
    TY = np.load('output/es_en_T-medical.npy')

    TX_clusters = [None] * ncentroids
    TY_clusters = [None] * ncentroids
    print(TX_clusters)

    for cur_cluster_x, cur_cluster_y in enumerate(x_cluster_maps):
        src_W = np.array(clusters_x_emb[cur_cluster_x]).T
        tgt_W = np.array(clusters_y_emb[cur_cluster_y]).T

        # Truncate both sides to the same number of columns.
        cluster_size = min(src_W.shape[1], tgt_W.shape[1])
        src_W = src_W[:, :cluster_size]
        tgt_W = tgt_W[:, :cluster_size]

        print(src_W.shape)
        print(tgt_W.shape)

        # Training pass, warm-started from the loaded global transforms.
        icp_train = ICPTrainer(src_W, tgt_W, False, src_W.shape[0])
        icp_train.icp.TX = TX
        icp_train.icp.TY = TY
        _, _, rec, bb = icp_train.train_icp(params.icp_train_epochs,
                                            is_init=False)
        print("Training - Achieved: Rec %f BB %d" % (rec, bb))

        # Reciprocal fine-tuning pass, starting from the trained transforms.
        icp_ft = ICPTrainer(src_W, tgt_W, False, src_W.shape[0])
        icp_ft.icp.TX = icp_train.icp.TX
        icp_ft.icp.TY = icp_train.icp.TY
        _, _, rec, bb = icp_ft.train_icp(params.icp_ft_epochs,
                                         do_reciprocal=True)
        print("Reciprocal Pairs - Achieved: Rec %f BB %d" % (rec, bb))

        TX_clusters[cur_cluster_x] = icp_ft.icp.TX
        TY_clusters[cur_cluster_y] = icp_ft.icp.TY

    if not os.path.exists(params.cp_dir):
        os.mkdir(params.cp_dir)

    np.save(
        "%s/%s_%s_T_clusters-medical" %
        (params.cp_dir, params.src_lang, params.tgt_lang), TX_clusters)
    np.save(
        "%s/%s_%s_T_clusters-medical" %
        (params.cp_dir, params.tgt_lang, params.src_lang), TY_clusters)
    faiss.write_index(
        kmeans_x.index,
        "%s/%s_clusters_index" % (params.cp_dir, params.src_lang))
    faiss.write_index(
        kmeans_y.index,
        "%s/%s_clusters_index" % (params.cp_dir, params.tgt_lang))