def run_icp(idx_x, idx_y, i):
    """Run one ICP initialisation starting from the given index arrays.

    Builds an ``ICPTrainer`` on private copies of the module-level ``src_W``
    and ``tgt_W`` matrices, trains it for ``params.icp_init_epochs`` epochs
    and prints a one-line summary including the wall-clock time taken.

    Args:
        idx_x: Forwarded to ``train_icp`` as ``indices_x``.
        idx_y: Forwarded to ``train_icp`` as ``indices_y``.
        i: Run number; only used to label the printed summary.

    Returns:
        The tuple ``(indices_x, indices_y, rec, bb)`` as produced by
        ``ICPTrainer.train_icp``.
    """
    # Copies keep this run from touching the shared matrices.
    trainer = ICPTrainer(src_W.copy(), tgt_W.copy(), True, params.n_pca)

    started = time.time()
    indices_x, indices_y, rec, bb = trainer.train_icp(
        params.icp_init_epochs,
        indices_x=idx_x,
        indices_y=idx_y,
    )
    elapsed = time.time() - started

    summary = "%d: Rec %f BB %d Time: %f" % (i, rec, bb, elapsed)
    print(summary)
    return indices_x, indices_y, rec, bb
def run_icp(s0, i):
    """Run one seeded ICP initialisation.

    Seeds NumPy's global RNG with ``s0 + i`` so that every run index gets its
    own reproducible random stream, then trains an ``ICPTrainer`` on private
    copies of the module-level ``src_W`` / ``tgt_W`` matrices for
    ``params.icp_init_epochs`` epochs and prints a timed one-line summary.

    Args:
        s0: Base seed shared by all runs of one experiment.
        i: Run number; added to ``s0`` for the seed and used as print label.

    Returns:
        The tuple ``(indices_x, indices_y, rec, bb)`` as produced by
        ``ICPTrainer.train_icp``.
    """
    np.random.seed(s0 + i)

    # Copies keep this run from touching the shared matrices.
    trainer = ICPTrainer(src_W.copy(), tgt_W.copy(), True, params.n_pca)

    started = time.time()
    indices_x, indices_y, rec, bb = trainer.train_icp(params.icp_init_epochs)
    elapsed = time.time() - started

    summary = "%d: Rec %f BB %d Time: %f" % (i, rec, bb, elapsed)
    print(summary)
    return indices_x, indices_y, rec, bb
def run_icp_with_sample(i, sample_size=10000):
    """Run one ICP initialisation on a random column sample of the embeddings.

    Draws ``sample_size`` column indices (with replacement) and takes the same
    columns from the module-level ``src_W`` and ``tgt_W``, so column ``k`` of
    both sampled matrices comes from the same original column. The trainer is
    started from the identity assignment ``0..sample_size-1`` on both sides.

    Args:
        i: Run number; only used to label the printed summary.
        sample_size: Number of columns to draw. Defaults to 10000, the value
            that used to be hard-coded.

    Returns:
        ``(indices_x, indices_y, rec, bb, sample_src_W, sample_tgt_W)`` where
        the first four values come from ``ICPTrainer.train_icp`` and the last
        two are the sampled matrices the trainer was built on.
    """
    # NOTE(review): the lower bound of 1 means column 0 is never sampled.
    # Kept as-is because it may be intentional -- confirm.
    sample = np.random.randint(1, src_W.shape[1], size=sample_size)

    # np.take with an index array already returns a new array, so copying the
    # full matrices first only wasted memory and time.
    sample_src_W = np.take(src_W, sample, axis=1)
    sample_tgt_W = np.take(tgt_W, sample, axis=1)

    icp = ICPTrainer(sample_src_W, sample_tgt_W, True, params.n_pca)
    t0 = time.time()
    # Two separate arange arrays, so the trainer never sees aliased inputs.
    indices_x, indices_y, rec, bb = icp.train_icp(
        params.icp_init_epochs,
        indices_x=np.arange(len(sample)),
        indices_y=np.arange(len(sample)),
    )
    dt = time.time() - t0
    print("%d: Rec %f BB %d Time: %f" % (i, rec, bb, dt))
    return indices_x, indices_y, rec, bb, sample_src_W, sample_tgt_W
def _sub_icp_init_run(src_W, tgt_W, n_pca, init_epochs, s0, i):
    """Run one seeded ICP initialisation; module-level so a Pool can pickle it."""
    np.random.seed(s0 + i)
    icp = ICPTrainer(src_W.copy(), tgt_W.copy(), True, n_pca)
    t0 = time.time()
    indices_x, indices_y, rec, bb = icp.train_icp(init_epochs)
    dt = time.time() - t0
    print("%d: Rec %f BB %d Time: %f" % (i, rec, bb, dt))
    return indices_x, indices_y, rec, bb


def sub_icp(src_W, tgt_W, n_icp_runs):
    """Fit ICP transforms between ``src_W`` and ``tgt_W`` in three stages.

    1. ``n_icp_runs`` independent, seeded initialisation runs; the run with
       the smallest ``rec`` value supplies the starting index arrays.
    2. A training pass of ``params.icp_train_epochs`` epochs from those
       indices.
    3. A fine-tuning pass of ``params.icp_ft_epochs`` epochs with
       ``do_reciprocal=True``, warm-started from the stage-2 transforms.

    Stage 1 runs serially when ``params.n_processes == 1`` and in a
    ``multiprocessing.Pool`` otherwise.

    Args:
        src_W: Source matrix handed to ``ICPTrainer``.
        tgt_W: Target matrix handed to ``ICPTrainer``.
        n_icp_runs: Number of initialisation runs; must be at least 1.

    Returns:
        ``(TX, TY)`` -- the transforms held by the fine-tuned trainer.

    Raises:
        ValueError: If ``n_icp_runs`` is smaller than 1.
    """
    if n_icp_runs < 1:
        # Without this guard the failure only showed up later as an opaque
        # ValueError from np.zeros / np.argmin.
        raise ValueError("n_icp_runs must be >= 1, got %r" % (n_icp_runs,))

    s0 = np.random.randint(50000)
    run_args = [
        (src_W, tgt_W, params.n_pca, params.icp_init_epochs, s0, i)
        for i in range(n_icp_runs)
    ]

    if params.n_processes == 1:
        results = [_sub_icp_init_run(*args) for args in run_args]
    else:
        # The worker has to be a module-level function taking every input
        # explicitly: a nested closure cannot be pickled for the workers, and
        # mapping it over a bare range() supplied only one of its arguments.
        # The `with` block tears the pool down even if a run raises.
        with multiprocessing.Pool(processes=params.n_processes) as pool:
            pending = [
                pool.apply_async(_sub_icp_init_run, args) for args in run_args
            ]
            results = [
                job.get() for job in tqdm.tqdm(pending, total=n_icp_runs)
            ]

    data = np.zeros((n_icp_runs, 2))
    best_idx_x = None
    best_idx_y = None
    min_rec = 1e8
    for i, (indices_x, indices_y, rec, bb) in enumerate(results):
        data[i, 0] = rec
        data[i, 1] = bb
        if rec < min_rec:
            best_idx_x = indices_x
            best_idx_y = indices_y
            min_rec = rec

    idx = np.argmin(data[:, 0], 0)
    print("Init - Achieved: Rec %f BB %d" % (data[idx, 0], data[idx, 1]))

    icp_train = ICPTrainer(src_W, tgt_W, False, src_W.shape[0])
    _, _, rec, bb = icp_train.train_icp(
        params.icp_train_epochs, True, best_idx_x, best_idx_y)
    print("Training - Achieved: Rec %f BB %d" % (rec, bb))

    # Fine-tuning uses a fresh trainer warm-started from the trained
    # transforms.
    icp_ft = ICPTrainer(src_W, tgt_W, False, src_W.shape[0])
    icp_ft.icp.TX = icp_train.icp.TX
    icp_ft.icp.TY = icp_train.icp.TY
    _, _, rec, bb = icp_ft.train_icp(params.icp_ft_epochs, do_reciprocal=True)
    print("Reciprocal Pairs - Achieved: Rec %f BB %d" % (rec, bb))

    TX = icp_ft.icp.TX
    TY = icp_ft.icp.TY
    return TX, TY
def train_model(src_lang, tgt_lang, src_W, tgt_W, n_runs, n_pca, n_processes,
                init_epochs, train_epochs, ft_epochs, n_ft,
                min_rec=1e8, min_bb=None):
    """Fit ICP transforms between two embedding matrices in three stages.

    1. ``n_runs`` initialisation runs through ``_run_icp``; the run with the
       smallest ``rec`` below ``min_rec`` supplies the starting indices.
    2. A training pass of ``train_epochs`` epochs on ``src_W`` / ``tgt_W``.
    3. A fine-tuning pass of ``ft_epochs`` epochs with ``do_reciprocal=True``
       on the matrices loaded from ``data/<lang>_<n_ft>.npy`` (transposed),
       warm-started from the stage-2 transforms.

    Args:
        src_lang: Source language tag used in the fine-tuning file name.
        tgt_lang: Target language tag used in the fine-tuning file name.
        src_W: Source matrix for initialisation and training.
        tgt_W: Target matrix for initialisation and training.
        n_runs: Number of initialisation runs; must be at least 1.
        n_pca: Forwarded to ``_run_icp``.
        n_processes: 1 runs the initialisation serially, anything else uses
            a ``multiprocessing.Pool`` of that size.
        init_epochs: Forwarded to ``_run_icp``.
        train_epochs: Epochs for the training pass.
        ft_epochs: Epochs for the fine-tuning pass.
        n_ft: Number embedded in the fine-tuning file names.
        min_rec: Only runs with ``rec`` strictly below this value can be
            chosen as the starting point.
        min_bb: Unused; kept so existing callers keep working.

    Returns:
        ``(TX, TY)`` -- the transforms held by the fine-tuned trainer.

    Raises:
        ValueError: If ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
        # Without this guard the failure only showed up later as an opaque
        # ValueError from np.zeros / np.argmin.
        raise ValueError("n_runs must be >= 1, got %r" % (n_runs,))

    s0 = np.random.randint(50000)
    run_args = [
        (src_W, tgt_W, s0, i, n_pca, init_epochs) for i in range(n_runs)
    ]

    if n_processes == 1:
        results = [_run_icp(*args) for args in run_args]
    else:
        # Workers must get the same six positional arguments as the serial
        # path; mapping _run_icp over a bare range() passed only the run
        # index. The `with` block tears the pool down even if a run raises.
        # NOTE(review): assumes _run_icp is a picklable module-level function.
        with multiprocessing.Pool(processes=n_processes) as pool:
            pending = [pool.apply_async(_run_icp, args) for args in run_args]
            results = [job.get() for job in tqdm.tqdm(pending, total=n_runs)]

    data = np.zeros((n_runs, 2))
    best_idx_x = None
    best_idx_y = None
    for i, (indices_x, indices_y, rec, bb) in enumerate(results):
        data[i, 0] = rec
        data[i, 1] = bb
        if rec < min_rec:
            best_idx_x = indices_x
            best_idx_y = indices_y
            min_rec = rec

    idx = np.argmin(data[:, 0], 0)
    print("Init - Achieved: Rec %f BB %d" % (data[idx, 0], data[idx, 1]))

    icp_train = ICPTrainer(src_W, tgt_W, False, src_W.shape[0])
    _, _, rec, bb = icp_train.train_icp(
        train_epochs, True, best_idx_x, best_idx_y)
    print("Training - Achieved: Rec %f BB %d" % (rec, bb))

    # Fine-tuning runs on a separately stored pair of matrices.
    src_W = np.load("data/%s_%d.npy" % (src_lang, n_ft)).T
    tgt_W = np.load("data/%s_%d.npy" % (tgt_lang, n_ft)).T
    icp_ft = ICPTrainer(src_W, tgt_W, False, src_W.shape[0])
    icp_ft.icp.TX = icp_train.icp.TX
    icp_ft.icp.TY = icp_train.icp.TY
    _, _, rec, bb = icp_ft.train_icp(ft_epochs, do_reciprocal=True)
    print("Reciprocal Pairs - Achieved: Rec %f BB %d" % (rec, bb))

    TX = icp_ft.icp.TX
    TY = icp_ft.icp.TY
    return TX, TY
min_bb = bb best_sample_src_W = sample_src_W best_sample_tgt_W = sample_tgt_W print(best_sample_src_W) return best_idx_x, best_idx_y, best_sample_src_W, best_sample_tgt_W #best_idx_x, best_idx_y, best_sample_src_W, best_sample_tgt_W = initialize_with_medical_dict_with_sample() #best_idx_x, best_idx_y = initialize_with_medical_dict() best_idx_x, best_idx_y = (np.arange(src_W.shape[1]), np.arange(src_W.shape[1])) idx = np.argmin(data[:, 0], 0) print("Init - Achieved: Rec %f BB %d" % (data[idx, 0], data[idx, 1])) icp_train = ICPTrainer(src_W, tgt_W, False, src_W.shape[0]) _, _, rec, bb = icp_train.train_icp(params.icp_train_epochs, True, indices_x=best_idx_x, indices_y=best_idx_y) print("Training - Achieved: Rec %f BB %d" % (rec, bb)) src_W = np.load('data/%s_training.npy' % (params.src_lang)).T tgt_W = np.load('data/%s_training.npy' % (params.tgt_lang)).T icp_ft = ICPTrainer(src_W, tgt_W, False, src_W.shape[0]) icp_ft.icp.TX = icp_train.icp.TX icp_ft.icp.TY = icp_train.icp.TY indices_x, indices_y, rec, bb = icp_ft.train_icp(params.icp_ft_epochs, do_reciprocal=True) print("Reciprocal Pairs - Achieved: Rec %f BB %d" % (rec, bb)) TX = icp_ft.icp.TX TY = icp_ft.icp.TY if not os.path.exists(params.cp_dir): os.mkdir(params.cp_dir)
def _fit_kmeans_labels(vectors, n_dims, ncentroids, niter, verbose):
    """Fit a faiss k-means on ``vectors``; return the model and flat labels."""
    kmeans = faiss.Kmeans(n_dims, ncentroids, niter=niter, verbose=verbose)
    kmeans.train(vectors)
    _, labels = kmeans.index.search(vectors, 1)
    return kmeans, labels.flatten()


def _count_reciprocal_cluster_pairs(fwd, bwd, fwd_clusters, bwd_clusters,
                                    ncentroids, dtype):
    """Count reciprocal matches per (own cluster, partner cluster) pair.

    Item ``i`` is reciprocal when ``bwd[fwd[i]] == i``. Each such item adds
    one to ``counts[fwd_clusters[i], bwd_clusters[fwd[i]]]``.
    """
    counts = np.zeros((ncentroids, ncentroids), dtype=dtype)
    fwd = np.asarray(fwd)
    bwd = np.asarray(bwd)
    mutual = bwd[fwd] == np.arange(len(fwd))
    # np.add.at accumulates repeated index pairs, which plain fancy-index
    # `+=` would silently collapse into a single increment.
    np.add.at(counts, (fwd_clusters[mutual], bwd_clusters[fwd[mutual]]), 1)
    return counts


def match_by_reciprocal_pairs(ncentroids=4, niter=20, verbose=True):
    """Train one pair of ICP transforms per matched embedding cluster.

    Steps:
      1. Load ``data/<lang>_training.npy`` for ``params.src_lang`` and
         ``params.tgt_lang`` and cluster each side with faiss k-means.
      2. Load ``indices_x.npy`` / ``indices_y.npy`` and count, for every pair
         of (source cluster, target cluster), how many items are reciprocal
         matches (``indices_y[indices_x[i]] == i`` and vice versa).
      3. Map each cluster to the partner cluster with the most reciprocal
         matches and assert that the two mappings invert each other.
      4. For each matched cluster pair, train and then fine-tune an
         ``ICPTrainer`` warm-started from the transforms stored in
         ``output/en_es_T-medical.npy`` / ``output/es_en_T-medical.npy``.
      5. Save the per-cluster transforms and both k-means indexes under
         ``params.cp_dir``.

    Args:
        ncentroids: Number of k-means clusters per language (default 4).
        niter: Number of k-means iterations (default 20).
        verbose: Passed to ``faiss.Kmeans`` (default True).

    Returns:
        None. Results are written to disk.

    Raises:
        AssertionError: If the loaded arrays differ in length or the cluster
            mapping is not reciprocal.
    """
    import os  # local import: only needed to create the output directory

    x = np.load('data/%s_training.npy' % (params.src_lang)).astype('float32')
    y = np.load('data/%s_training.npy' % (params.tgt_lang)).astype('float32')
    d = x.shape[1]  # the same dimensionality is used for both k-means models

    kmeans_x, clusters_x = _fit_kmeans_labels(x, d, ncentroids, niter, verbose)
    print(CountFrequency(clusters_x))
    kmeans_y, clusters_y = _fit_kmeans_labels(y, d, ncentroids, niter, verbose)
    print(CountFrequency(clusters_y))

    # presumably indices_x[i] is the y-row matched to x[i] and indices_y[j]
    # the x-row matched to y[j] -- verify against whatever wrote these files.
    indices_x = np.load('indices_x.npy')
    indices_y = np.load('indices_y.npy')
    assert len(indices_x) == len(indices_y) == len(x) == len(y)

    # dtypes (float vs int32) are kept as before so the printed matrices
    # look the same.
    x_y_clusters_match = _count_reciprocal_cluster_pairs(
        indices_x, indices_y, clusters_x, clusters_y, ncentroids, 'float64')
    clusters_x_emb = [x[clusters_x == c] for c in range(ncentroids)]
    print(clusters_x_emb[0].shape)

    y_x_clusters_match = _count_reciprocal_cluster_pairs(
        indices_y, indices_x, clusters_y, clusters_x, ncentroids, 'int32')
    clusters_y_emb = [y[clusters_y == c] for c in range(ncentroids)]
    print(clusters_y_emb[0].shape)

    print(x_y_clusters_match)
    print(y_x_clusters_match)

    x_cluster_maps = x_y_clusters_match.argmax(axis=1)
    y_cluster_maps = y_x_clusters_match.argmax(axis=1)
    # check that the clusters map is reciprocal
    assert ((x_cluster_maps[y_cluster_maps] == np.arange(ncentroids)).all())

    # NOTE(review): these paths are fixed to en/es regardless of
    # params.src_lang / params.tgt_lang -- confirm that is intended.
    TX = np.load('output/en_es_T-medical.npy')
    TY = np.load('output/es_en_T-medical.npy')

    TX_clusters = [None] * ncentroids
    TY_clusters = [None] * ncentroids
    print(TX_clusters)

    for cur_cluster_x in range(ncentroids):
        cur_cluster_y = x_cluster_maps[cur_cluster_x]
        src_W = clusters_x_emb[cur_cluster_x].T
        tgt_W = clusters_y_emb[cur_cluster_y].T

        # Both sides are truncated to the smaller cluster so the trainer
        # receives matrices with the same number of columns.
        cluster_size = min(src_W.shape[1], tgt_W.shape[1])
        src_W = src_W[:, :cluster_size]
        tgt_W = tgt_W[:, :cluster_size]
        print(src_W.shape)
        print(tgt_W.shape)

        icp_train = ICPTrainer(src_W, tgt_W, False, src_W.shape[0])
        # NOTE(review): every cluster is handed the same TX / TY objects; if
        # train_icp updates them in place, later clusters start from earlier
        # clusters' results -- confirm.
        icp_train.icp.TX = TX
        icp_train.icp.TY = TY
        _, _, rec, bb = icp_train.train_icp(
            params.icp_train_epochs, is_init=False)
        print("Training - Achieved: Rec %f BB %d" % (rec, bb))

        icp_ft = ICPTrainer(src_W, tgt_W, False, src_W.shape[0])
        icp_ft.icp.TX = icp_train.icp.TX
        icp_ft.icp.TY = icp_train.icp.TY
        ind_x, ind_y, rec, bb = icp_ft.train_icp(
            params.icp_ft_epochs, do_reciprocal=True)
        print("Reciprocal Pairs - Achieved: Rec %f BB %d" % (rec, bb))

        TX_clusters[cur_cluster_x] = icp_ft.icp.TX
        TY_clusters[cur_cluster_y] = icp_ft.icp.TY

    # The writes below fail if the checkpoint directory is missing.
    os.makedirs(params.cp_dir, exist_ok=True)
    np.save(
        "%s/%s_%s_T_clusters-medical"
        % (params.cp_dir, params.src_lang, params.tgt_lang),
        TX_clusters)
    np.save(
        "%s/%s_%s_T_clusters-medical"
        % (params.cp_dir, params.tgt_lang, params.src_lang),
        TY_clusters)
    faiss.write_index(
        kmeans_x.index,
        "%s/%s_clusters_index" % (params.cp_dir, params.src_lang))
    faiss.write_index(
        kmeans_y.index,
        "%s/%s_clusters_index" % (params.cp_dir, params.tgt_lang))