def run(
        ref, model_path, num_clusters, num_cluster_samples, seed, out_cluster_samples_file_hier,
        max_examples, out_cluster_samples_file, data_path, view1_col, view2_col, label_col,
        sampling_strategy, mvsc_no_unk):
    """Cluster encoder latents of both views with MVSC and print clustering metrics.

    Loads a saved encoder and a dataset, encodes a random permutation of the
    training indices for view 1 and view 2, runs multi-view spectral clustering
    on the two latent sets, and prints precision/recall/F1/accuracy (over
    examples whose gold label id is > 0) plus silhouette and Davies-Bouldin
    scores computed on the view-1 latents.

    Args:
        model_path: path handed to ``multiview_encoders.load_model``.
        num_clusters: number of clusters for MVSC; ``None`` means "use the
            number of dataset classes" (``len(dataset.id_to_label) - 1``).
        seed: seed applied to torch, numpy and ``random``.
        max_examples: if not ``None``, only the first ``max_examples`` of the
            permuted training indices are used.
        data_path, view1_col, view2_col, label_col: forwarded to ``Dataset``.
        mvsc_no_unk: if truthy, use ``dataset.trn_idx_no_unk`` instead of
            ``dataset.trn_idx``.
        ref, num_cluster_samples, out_cluster_samples_file_hier,
        out_cluster_samples_file, sampling_strategy: accepted for interface
            compatibility; not read by this function.

    Returns:
        None. Results are printed.
    """
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # Only the encoder is needed; the vocabulary metadata returned alongside it is unused.
    _, _, _, _, mvc_encoder = multiview_encoders.load_model(model_path)
    print('loaded model')

    print('loading dataset')
    dataset = Dataset(data_path, view1_col=view1_col, view2_col=view2_col, label_col=label_col)
    # NOTE(review): the "- 1" presumably drops the unknown label (id 0), matching
    # the "g > 0" filter below - confirm against Dataset.
    n_cluster = len(dataset.id_to_label) - 1
    print("loaded dataset, num of class = %d" % n_cluster)

    idxes = dataset.trn_idx_no_unk if mvsc_no_unk else dataset.trn_idx
    trn_idx = [x.item() for x in np.random.permutation(idxes)]
    if max_examples is not None:
        trn_idx = trn_idx[:max_examples]

    num_clusters = n_cluster if num_clusters is None else num_clusters
    print('clustering over num clusters', num_clusters)

    # Use the resolved cluster count; previously the caller-supplied value was
    # printed but the dataset class count was always passed to MVSC.
    mvsc = multiview.mvsc.MVSC(k=num_clusters)
    latent_z1s, golds = transform(dataset, trn_idx, mvc_encoder, view='v1')
    latent_z2s, _ = transform(dataset, trn_idx, mvc_encoder, view='v2')

    print('running mvsc', end='', flush=True)
    start = time.time()
    # NOTE(review): the second argument is presumably a per-view "is distance
    # matrix" flag list - confirm against the multiview package.
    preds, _eivalues, _eivectors, _sigmas = mvsc.fit_transform(
        [latent_z1s, latent_z2s], [False] * 2)
    print('...done')
    mvsc_time = time.time() - start
    print('time taken %.3f' % mvsc_time)

    # Supervised metrics are computed only over examples with gold label id > 0.
    labelled = [(g, p) for g, p in zip(golds, preds) if g > 0]
    lgolds = [g for g, _ in labelled]
    lpreds = [p for _, p in labelled]
    prec, rec, f1 = cluster_metrics.calc_prec_rec_f1(
        gnd_assignments=torch.LongTensor(lgolds).to(device),
        pred_assignments=torch.LongTensor(lpreds).to(device))
    acc = cluster_metrics.calc_ACC(
        torch.LongTensor(lpreds).to(device), torch.LongTensor(lgolds).to(device))

    # Unsupervised metrics use every clustered example, on the view-1 latents.
    silhouette = sklearn.metrics.silhouette_score(latent_z1s, preds, metric='euclidean')
    davies_bouldin = sklearn.metrics.davies_bouldin_score(latent_z1s, preds)

    print(
        f'{datetime.datetime.now()} pretrain: eval prec={prec:.4f} rec={rec:.4f} f1={f1:.4f} '
        f'acc={acc:.4f} sil={silhouette:.4f}, db={davies_bouldin:.4f}')
def calc_prec_rec_f1_acc(preds, golds):
    """Score predicted cluster assignments against gold labels.

    Pairs whose gold label is not greater than 0 are dropped before scoring.
    The remaining assignments are converted to ``LongTensor`` on the
    module-level ``device`` and passed to ``cluster_metrics``.

    Args:
        preds: iterable of predicted cluster ids, aligned with ``golds``.
        golds: iterable of gold label ids.

    Returns:
        Tuple ``(prec, rec, f1, acc)`` as produced by
        ``cluster_metrics.calc_prec_rec_f1`` and ``cluster_metrics.calc_ACC``.
    """
    def on_device(values):
        return torch.LongTensor(values).to(device)

    scored_pairs = [(gold, pred) for gold, pred in zip(golds, list(preds)) if gold > 0]
    kept_golds = [gold for gold, _ in scored_pairs]
    kept_preds = [pred for _, pred in scored_pairs]

    prec, rec, f1 = cluster_metrics.calc_prec_rec_f1(
        gnd_assignments=on_device(kept_golds),
        pred_assignments=on_device(kept_preds))
    acc = cluster_metrics.calc_ACC(on_device(kept_preds), on_device(kept_golds))
    return prec, rec, f1, acc
def main():
    """Command-line entry point: optional pretraining, then alternating-view training.

    Steps, as implemented below:
      1. Parse arguments, seed numpy, load the dataset and either load a saved
         model (``--model-path``) or build one from embeddings.
      2. Run ``--pre-epoch`` pretraining epochs with the chosen method, keeping
         the state dict with the best test accuracy, then restore it and apply
         the matching ``after_pretrain_*`` hook with a fresh optimizer.
      3. Cluster view-1 latents of the training set with k-means and print metrics.
      4. For ``--num-epochs`` epochs call ``run_one_side`` for view 2 and then
         view 1, keeping the epoch whose test-set view-1/view-2 agreement F1 is
         highest.
      5. Restore the best epoch (if any epoch ran), print final metrics and
         optionally save model state and assignments to ``--save-model-path``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--data-path', type=str, default='./data/airlines_processed.csv')
    parser.add_argument('--glove-path', type=str, default='./data/glove.840B.300d.txt')
    parser.add_argument('--pre-model', type=str, choices=['ae', 'qt'], default='qt')
    parser.add_argument('--pre-epoch', type=int, default=0)
    parser.add_argument('--pt-batch', type=int, default=100)
    parser.add_argument('--model-path', type=str, help='path of pretrained model to load')
    parser.add_argument('--way', type=int, default=5)
    parser.add_argument('--num-epochs', type=int, default=100)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--save-model-path', type=str)
    parser.add_argument('--view1-col', type=str, default='view1')
    parser.add_argument('--view2-col', type=str, default='view2')
    parser.add_argument('--label-col', type=str, default='label')
    args = parser.parse_args()

    np.random.seed(args.seed)

    print('loading dataset')
    dataset = Dataset(
        args.data_path, view1_col=args.view1_col, view2_col=args.view2_col,
        label_col=args.label_col)
    n_cluster = len(dataset.id_to_label) - 1
    print("num of class = %d" % n_cluster)

    if args.model_path is not None:
        id_to_token, token_to_id, vocab_size, word_emb_size, model = \
            multiview_encoders.load_model(args.model_path)
        print('loaded model')
    else:
        id_to_token, token_to_id, vocab_size, word_emb_size, model = \
            multiview_encoders.from_embeddings(
                args.glove_path, dataset.id_to_token, dataset.token_to_id)
        print('created randomly initialized model')
    print('vocab_size', vocab_size)

    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    expressions = (model, optimizer)

    # ---- pretraining -------------------------------------------------------
    pre_acc, pre_state, pre_state_epoch = 0., None, None
    pretrain_method = {
        'ae': pretrain.pretrain_ae,
        'qt': pretrain.pretrain_qt,
    }[args.pre_model]
    for epoch in range(1, args.pre_epoch + 1):
        model.train()
        perm_idx = np.random.permutation(dataset.trn_idx)
        trn_loss, _ = pretrain_method(dataset, perm_idx, expressions, train=True)
        model.eval()
        _, tst_acc = pretrain_method(dataset, dataset.tst_idx, expressions, train=False)
        # Always record the first epoch so pre_state is never None when it is
        # restored below, even if test accuracy never rises above 0.
        if pre_state is None or tst_acc > pre_acc:
            pre_state = copy.deepcopy(model.state_dict())
            pre_acc = tst_acc
            pre_state_epoch = epoch
        print('{} epoch {}, train_loss={:.4f} test_acc={:.4f}'.format(
            datetime.datetime.now(), epoch, trn_loss, tst_acc))

    if args.pre_epoch > 0:
        # load best state
        model.load_state_dict(pre_state)
        print(f'loaded best state from epoch {pre_state_epoch}')

        # deepcopy pretrained views into v1 and/or view2
        {
            'ae': pretrain.after_pretrain_ae,
            'qt': pretrain.after_pretrain_qt,
        }[args.pre_model](model)

        # reinitialize optimizer
        optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
        expressions = (model, optimizer)
        print('applied post-pretraining')

    # ---- initial view-1 clustering ----------------------------------------
    kmeans = sklearn.cluster.KMeans(n_clusters=n_cluster, max_iter=300, verbose=0, random_state=0)
    z_v1, golds = transform(dataset, dataset.trn_idx, model, encoder='v1')
    preds_v1 = kmeans.fit_predict(z_v1)
    prec, rec, f1, acc = calc_prec_rec_f1_acc(preds_v1, golds)
    print(
        f'{datetime.datetime.now()} pretrain: test prec={prec:.4f} rec={rec:.4f} '
        f'f1={f1:.4f} acc={acc:.4f}')

    # ---- alternating-view training ----------------------------------------
    shot, way, query = 5, args.way, 15
    preds_v2 = None
    best_epoch, best_model, best_dev_f1 = None, None, None
    best_preds_v1, best_preds_v2 = None, None
    for epoch in range(1, args.num_epochs + 1):
        trn_loss = 0.

        # Train the v2 side from the current v1 assignments, then the v1 side
        # from the fresh v2 assignments.
        _loss, preds_v2, tst_preds_v2 = run_one_side(
            model=model, optimizer=optimizer, preds_left=preds_v1,
            pt_batch=args.pt_batch, way=way, shot=shot, query=query, n_cluster=n_cluster,
            dataset=dataset, right_encoder_side='v2')
        trn_loss += _loss

        _loss, preds_v1, tst_preds_v1 = run_one_side(
            model=model, optimizer=optimizer, preds_left=preds_v2,
            pt_batch=args.pt_batch, way=way, shot=shot, query=query, n_cluster=n_cluster,
            dataset=dataset, right_encoder_side='v1')
        trn_loss += _loss

        # Model selection uses agreement between the two views' test-set
        # predictions, not the gold labels.
        dev_f1 = cluster_metrics.calc_f1(
            gnd_assignments=torch.LongTensor(tst_preds_v1).to(device),
            pred_assignments=torch.LongTensor(tst_preds_v2).to(device))
        dev_acc = cluster_metrics.calc_ACC(
            torch.LongTensor(tst_preds_v2).to(device),
            torch.LongTensor(tst_preds_v1).to(device))
        print('dev view 1 vs view 2: f1={:.4f} acc={:.4f}'.format(dev_f1, dev_acc))

        if best_dev_f1 is None or dev_f1 > best_dev_f1:
            print('new best epoch', epoch)
            best_epoch = epoch
            best_dev_f1 = dev_f1
            best_model = copy.deepcopy(model.state_dict())
            best_preds_v1 = preds_v1.copy()
            best_preds_v2 = preds_v2.copy()

        prec, rec, f1, acc = calc_prec_rec_f1_acc(preds_v1, golds)
        print(
            f'{datetime.datetime.now()} epoch {epoch}, test prec={prec:.4f} rec={rec:.4f} '
            f'f1={f1:.4f} acc={acc:.4f}')

    # With --num-epochs 0 there is no best epoch; keep the k-means assignments
    # and the current model instead of restoring from undefined state.
    if best_model is not None:
        print('restoring model for best dev epoch', best_epoch)
        model.load_state_dict(best_model)
        preds_v1, preds_v2 = best_preds_v1, best_preds_v2

    prec, rec, f1, acc = calc_prec_rec_f1_acc(preds_v1, golds)
    print(
        f'{datetime.datetime.now()} test prec={prec:.4f} rec={rec:.4f} f1={f1:.4f} acc={acc:.4f}'
    )

    if args.save_model_path is not None:
        preds_v1 = torch.from_numpy(preds_v1)
        if preds_v2 is not None:
            preds_v2 = torch.from_numpy(preds_v2)
        state = {
            'model_state': model.state_dict(),
            'id_to_token': dataset.id_to_token,
            'word_emb_size': word_emb_size,
            'v1_assignments': preds_v1,
            'v2_assignments': preds_v2
        }
        with open(expand(args.save_model_path), 'wb') as f:
            torch.save(state, f)
        print('saved model to ', args.save_model_path)
def run(data_path, model, pca_dims, view1_col, view2_col, label_col, no_idf, mvsc_no_unk):
    """Run a tf-idf + truncated-SVD clustering baseline and print/return metrics.

    Training examples are rendered as space-separated token-id strings,
    vectorised with tf-idf, reduced with ``TruncatedSVD`` and then clustered:

    * ``'view1pca'``: k-means on view 1 only.
    * ``'view2pca'``: k-means on view 2 (all utterances flattened).
    * ``'wholeconvpca'``: k-means on view 1 concatenated with flattened view 2.
    * ``'mvsc'``: multi-view spectral clustering on the two views' reductions.

    Args:
        data_path, view1_col, view2_col, label_col: forwarded to ``Dataset``.
        model: one of the four mode names above.
        pca_dims: ``n_components`` for ``TruncatedSVD``.
        no_idf: if truthy, tf-idf is computed with ``use_idf=False``.
        mvsc_no_unk: in ``'mvsc'`` mode, use ``dataset.trn_idx_no_unk`` instead
            of ``dataset.trn_idx``.

    Returns:
        ``(prec, rec, f1, acc)`` computed over examples whose gold label id is
        > 0, or ``None`` if ``model == 'mvsc'`` and the ``multiview`` package
        cannot be imported.

    Raises:
        ValueError: if ``model`` is not one of the supported mode names.
    """
    print('loading dataset')
    dataset = Dataset(data_path, view1_col=view1_col, view2_col=view2_col, label_col=label_col)
    n_cluster = len(dataset.id_to_label) - 1
    print("num of class = %d" % n_cluster)

    vocab_size = len(dataset.token_to_id)
    print('vocab_size', vocab_size)

    if model == 'mvsc':
        try:
            import multiview
        except ImportError:
            print('please install https://github.com/mariceli3/multiview')
            return
        print('imported multiview ok')

    def run_pca(features):
        """Tf-idf vectorise the token-id strings and reduce to ``pca_dims`` dimensions."""
        print('fitting tfidf vectorizer', flush=True, end='')
        # Tokens are integer ids rendered as text, hence the digit-only pattern.
        vectorizer = TfidfVectorizer(
            token_pattern='\\d+', ngram_range=(1, 1), analyzer='word',
            min_df=0.0, max_df=1.0, use_idf=not no_idf)
        X = vectorizer.fit_transform(features)
        print(' ... done')
        print('X.shape', X.shape)

        print('running pca', flush=True, end='')
        pca = TruncatedSVD(n_components=pca_dims)
        X2 = pca.fit_transform(X)
        print(' ... done')
        return X2

    def as_text(token_ids):
        """Render a sequence of token ids as a space-separated string."""
        return ' '.join(str(tok) for tok in token_ids)

    def flatten(conv):
        """Concatenate the token ids of every utterance in a conversation."""
        return [tok for utt in conv for tok in utt]

    if model in ['view1pca', 'view2pca', 'wholeconvpca']:
        idxes = dataset.trn_idx
        if model == 'view1pca':
            utts = [dataset[idx][0][0] for idx in idxes]
        elif model == 'view2pca':
            utts = [flatten(dataset[idx][0][1]) for idx in idxes]
        else:  # 'wholeconvpca'
            v1 = [dataset[idx][0][0] for idx in idxes]
            v2 = [flatten(dataset[idx][0][1]) for idx in idxes]
            utts = [first + second for first, second in zip(v1, v2)]
        X2 = run_pca([as_text(utt) for utt in utts])

        print('running kmeans', flush=True, end='')
        kmeans = sklearn.cluster.KMeans(
            n_clusters=n_cluster, max_iter=300, verbose=0, random_state=0)
        preds = kmeans.fit_predict(X2)
        print(' ... done')
    elif model == 'mvsc':
        mvsc = multiview.mvsc.MVSC(k=n_cluster)
        idxes = dataset.trn_idx_no_unk if mvsc_no_unk else dataset.trn_idx
        v1 = [as_text(dataset[idx][0][0]) for idx in idxes]
        v2 = [as_text(flatten(dataset[idx][0][1])) for idx in idxes]
        v1_pca = run_pca(v1)
        v2_pca = run_pca(v2)

        print('running mvsc', end='', flush=True)
        start = time.time()
        # NOTE(review): the second argument is presumably a per-view "is distance
        # matrix" flag list - confirm against the multiview package.
        preds, _eivalues, _eivectors, _sigmas = mvsc.fit_transform(
            [v1_pca, v2_pca], [False] * 2)
        print('...done')
        mvsc_time = time.time() - start
        print('time taken %.3f' % mvsc_time)
    else:
        raise ValueError(f'unsupported model {model!r}')

    # Gold labels must come from the same indices as the clustered features;
    # with mvsc_no_unk these differ from dataset.trn_idx.
    golds = [dataset[idx][1] for idx in idxes]

    labelled = [(g, p) for g, p in zip(golds, preds) if g > 0]
    lgolds = [g for g, _ in labelled]
    lpreds = [p for _, p in labelled]
    prec, rec, f1 = cluster_metrics.calc_prec_rec_f1(
        gnd_assignments=torch.LongTensor(lgolds).to(device),
        pred_assignments=torch.LongTensor(lpreds).to(device))
    acc = cluster_metrics.calc_ACC(
        torch.LongTensor(lpreds).to(device), torch.LongTensor(lgolds).to(device))
    print(
        f'{datetime.datetime.now()} eval f1={f1:.4f} prec={prec:.4f} rec={rec:.4f} acc={acc:.4f}'
    )
    return prec, rec, f1, acc
def main():
    """Command-line entry point: optional pretraining, then alternating-view training.

    Steps, as implemented below:
      1. Parse arguments, seed numpy, load the dataset and either load a saved
         model (``--model-path``) or build one from embeddings.
      2. Run ``--pre-epoch`` pretraining epochs, tracking the state dict with
         the best test accuracy; when ``--save-model-path`` is given, write the
         current and best states after every epoch. Afterwards restore the best
         state, apply the ``after_pretrain_*`` hook and rebuild the optimizer.
      3. Cluster view-1 latents with k-means and print metrics.
      4. For ``--num-epochs`` epochs: train the v2 encoder on episodes sampled
         from the v1 assignments, re-cluster v2 latents, train the v1 encoder
         on episodes sampled from the v2 assignments, re-cluster v1 latents,
         and print view-agreement and gold-label metrics.
      5. Optionally save the model state, both assignment vectors and the
         index order each vector is aligned with.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--data-path', type=str, default='./data/airlines_processed.csv')
    parser.add_argument('--glove-path', type=str, default='./data/glove.840B.300d.txt')
    parser.add_argument('--pre-model', type=str, choices=['ae', 'qt'], default='qt')
    parser.add_argument('--pre-epoch', type=int, default=0)
    parser.add_argument('--pt-batch', type=int, default=100)
    parser.add_argument('--model-path', type=str, help='path of pretrained model to load')
    parser.add_argument('--way', type=int, default=5)
    parser.add_argument('--num-epochs', type=int, default=100)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--save-model-path', type=str)
    parser.add_argument('--view1-col', type=str, default='view1')
    parser.add_argument('--view2-col', type=str, default='view2')
    parser.add_argument('--label-col', type=str, default='tag')
    args = parser.parse_args()

    np.random.seed(args.seed)

    print('loading dataset')
    dataset = Dataset(
        args.data_path, view1_col=args.view1_col, view2_col=args.view2_col,
        label_col=args.label_col)
    n_cluster = len(dataset.id_to_label) - 1
    print("num of class = %d" % n_cluster)

    if args.model_path is not None:
        id_to_token, token_to_id, vocab_size, word_emb_size, model = \
            multiview_encoders.load_model(args.model_path)
        print('loaded model')
    else:
        id_to_token, token_to_id, vocab_size, word_emb_size, model = \
            multiview_encoders.create_model_from_embeddings(
                args.glove_path, dataset.id_to_token, dataset.token_to_id)
        print('created randomly initialized model')
    print('vocab_size', vocab_size)

    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    expressions = (model, optimizer)

    def save_checkpoint(path, model_state):
        """Write a pretraining checkpoint holding ``model_state`` to ``path``."""
        state = {
            'model_state': model_state,
            'id_to_token': dataset.id_to_token,
            'word_emb_size': word_emb_size
        }
        with open(expand(path), 'wb') as f:
            torch.save(state, f)
        print('saved model to ', path)

    def gold_metrics(assignments, gold_labels):
        """Precision/recall/F1/accuracy over examples whose gold label id is > 0."""
        labelled = [(g, p) for g, p in zip(gold_labels, assignments) if g > 0]
        lgolds = [g for g, _ in labelled]
        lpreds = [p for _, p in labelled]
        prec, rec, f1 = cluster_metrics.calc_prec_rec_f1(
            gnd_assignments=torch.LongTensor(lgolds).to(device),
            pred_assignments=torch.LongTensor(lpreds).to(device))
        acc = cluster_metrics.calc_ACC(
            torch.LongTensor(lpreds).to(device), torch.LongTensor(lgolds).to(device))
        return prec, rec, f1, acc

    # ---- pretraining -------------------------------------------------------
    pre_acc, pre_state = 0., None
    pretrain_method = {
        'ae': pretrain.pretrain_ae,
        'qt': pretrain.pretrain_qt,
    }[args.pre_model]
    for epoch in range(1, args.pre_epoch + 1):
        model.train()
        perm_idx = np.random.permutation(dataset.trn_idx)
        trn_loss, _ = pretrain_method(dataset, perm_idx, expressions, train=True)
        model.eval()
        _, tst_acc = pretrain_method(dataset, dataset.tst_idx, expressions, train=False)
        # Always record the first epoch so pre_state is never None when it is
        # saved or restored, even if test accuracy never rises above 0.
        if pre_state is None or tst_acc > pre_acc:
            pre_state = copy.deepcopy(model.state_dict())
            pre_acc = tst_acc
        print('{} epoch {}, train_loss={:.4f} test_acc={:.4f}'.format(
            datetime.datetime.now(), epoch, trn_loss, tst_acc))
        if args.save_model_path is not None:
            save_checkpoint(f'{args.save_model_path}_pre_e{epoch}.dat', model.state_dict())
            save_checkpoint(f'{args.save_model_path}_pre_best_e{epoch}.dat', pre_state)

    if args.pre_epoch > 0:
        # load best state
        model.load_state_dict(pre_state)
        print('loaded best state')

        # deepcopy pretrained views into v1 and/or view2
        {
            'ae': pretrain.after_pretrain_ae,
            'qt': pretrain.after_pretrain_qt,
        }[args.pre_model](model)

        # reinitialize optimizer
        optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
        expressions = (model, optimizer)
        print('applied post-pretraining')

    # ---- initial view-1 clustering ----------------------------------------
    kmeans = sklearn.cluster.KMeans(n_clusters=n_cluster, max_iter=300, verbose=0, random_state=0)
    latent_z1s, golds = transform(dataset, dataset.trn_idx, model, encoder='v1')
    pred1s = kmeans.fit_predict(latent_z1s)

    prec, rec, f1, acc = gold_metrics(pred1s, golds)
    silhouette = sklearn.metrics.silhouette_score(latent_z1s, pred1s, metric='euclidean')
    davies_bouldin = sklearn.metrics.davies_bouldin_score(latent_z1s, pred1s)
    print('{} pretrain: eval prec={:.4f} rec={:.4f} f1={:.4f} acc={:.4f} sil={:.4f}, db={:.4f}'.format(
        datetime.datetime.now(), prec, rec, f1, acc, silhouette, davies_bouldin))

    # ---- alternating-view training ----------------------------------------
    # perm_idx is the dataset-index order that pred1s is currently aligned with.
    perm_idx = dataset.trn_idx
    pred2s, centroids1, centroids2 = None, None, None
    pred1s_perm_idx, pred2s_perm_idx = None, None
    shot, way, query = 5, args.way, 15
    for epoch in range(1, args.num_epochs + 1):
        trn_loss = 0.

        # Train the v2 encoder on episodes grouped by the v1 assignments.
        sampler1 = CategoriesSampler(pred1s, args.pt_batch, way, shot + query)
        train1_batches = [[dataset[perm_idx[idx]] for idx in indices] for indices in sampler1]
        trn_loss += do_pass(train1_batches, shot, way, query, expressions, encoder='v2')

        # Re-cluster v2 latents, seeding k-means with centroids from the v1 assignments.
        latent_z2s, _ = transform(dataset, perm_idx, model, encoder='v2')
        centroids2 = calc_centroids(latent_z2s, pred1s, n_cluster)
        kmeans2 = sklearn.cluster.KMeans(
            n_clusters=n_cluster, init=centroids2, max_iter=10, verbose=0)
        pred2s = kmeans2.fit_predict(latent_z2s)
        pred2s_perm_idx = perm_idx.copy()
        tst_latent_z2s, _ = transform(dataset, dataset.tst_idx, model, encoder='v2')
        tst_pred2s = kmeans2.predict(tst_latent_z2s)

        # Train the v1 encoder on episodes grouped by the v2 assignments.
        sampler2 = CategoriesSampler(pred2s, args.pt_batch, way, shot + query)
        train2_batches = [[dataset[perm_idx[idx]] for idx in indices] for indices in sampler2]
        trn_loss += do_pass(train2_batches, shot, way, query, expressions, encoder='v1')

        perm_idx = np.random.permutation(dataset.trn_idx)
        latent_z1s, golds = transform(dataset, perm_idx, model, encoder='v1')
        # pred2s is ordered by the previous permutation while latent_z1s follows
        # the new one; realign before computing centroids so every latent is
        # paired with its own example's v2 assignment.
        # (assumes dataset indices are unique and integer-like)
        position_of = {int(idx): pos for pos, idx in enumerate(pred2s_perm_idx)}
        pred2s_aligned = pred2s[[position_of[int(idx)] for idx in perm_idx]]
        centroids1 = calc_centroids(latent_z1s, pred2s_aligned, n_cluster)
        kmeans1 = sklearn.cluster.KMeans(
            n_clusters=n_cluster, init=centroids1, max_iter=10, verbose=0)
        pred1s = kmeans1.fit_predict(latent_z1s)
        pred1s_perm_idx = perm_idx.copy()
        tst_latent_z1s, _ = transform(dataset, dataset.tst_idx, model, encoder='v1')
        tst_pred1s = kmeans1.predict(tst_latent_z1s)

        # Agreement between the two views' predictions on the test indices.
        view_f1 = cluster_metrics.calc_f1(
            gnd_assignments=torch.LongTensor(tst_pred1s).to(device),
            pred_assignments=torch.LongTensor(tst_pred2s).to(device))
        view_acc = cluster_metrics.calc_ACC(
            torch.LongTensor(tst_pred2s).to(device), torch.LongTensor(tst_pred1s).to(device))
        print('TEST f1={:.4f} acc={:.4f}'.format(view_f1, view_acc))

        prec, rec, f1, acc = gold_metrics(pred1s, golds)
        silhouette = sklearn.metrics.silhouette_score(latent_z1s, pred1s, metric='euclidean')
        davies_bouldin = sklearn.metrics.davies_bouldin_score(latent_z1s, pred1s)
        print('{} epoch {}, eval prec={:.4f} rec={:.4f} f1={:.4f} acc={:.4f} sil={:.4f}, db={:.4f}'.format(
            datetime.datetime.now(), epoch, prec, rec, f1, acc, silhouette, davies_bouldin))

    if args.save_model_path is not None:
        pred1s = torch.from_numpy(pred1s)
        if pred2s is not None:
            pred2s = torch.from_numpy(pred2s)
        # Each assignment vector is stored with the index order it is aligned to.
        state = {
            'model_state': model.state_dict(),
            'id_to_token': dataset.id_to_token,
            'word_emb_size': word_emb_size,
            'v1_assignments': pred1s,
            'v2_assignments': pred2s,
            'pred1s_perm_idx': pred1s_perm_idx,
            'pred2s_perm_idx': pred2s_perm_idx
        }
        with open(expand(args.save_model_path), 'wb') as f:
            torch.save(state, f)
        print('saved model to ', args.save_model_path)