예제 #1
0
def run(ref, model_path, num_clusters, num_cluster_samples, seed,
        out_cluster_samples_file_hier, max_examples, out_cluster_samples_file,
        data_path, view1_col, view2_col, label_col, sampling_strategy,
        mvsc_no_unk):
    """Run multi-view spectral clustering over encoder latents and print metrics.

    Loads a saved multiview encoder and a dataset, encodes a shuffled set of
    training examples with both views, clusters the two latent matrices
    jointly with ``multiview.mvsc.MVSC`` and prints precision/recall/F1,
    accuracy, silhouette and Davies-Bouldin scores.

    Args:
        model_path: passed to ``multiview_encoders.load_model``.
        num_clusters: number of clusters to fit; ``None`` falls back to
            ``len(dataset.id_to_label) - 1``.
        seed: seeds ``torch``, ``numpy.random`` and ``random``.
        max_examples: if not ``None``, only the first ``max_examples`` of the
            shuffled indices are clustered.
        data_path, view1_col, view2_col, label_col: forwarded to ``Dataset``.
        mvsc_no_unk: use ``dataset.trn_idx_no_unk`` instead of
            ``dataset.trn_idx`` as the index pool.
        ref, num_cluster_samples, out_cluster_samples_file_hier,
        out_cluster_samples_file, sampling_strategy: accepted but not read
            by this function body.

    Returns:
        None; all results are printed.
    """
    # Seed every RNG so the permutation below is repeatable for a given seed.
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    id_to_token, token_to_id, vocab_size, word_emb_size, mvc_encoder = \
        multiview_encoders.load_model(model_path)
    print('loaded model')

    print('loading dataset')
    dataset = Dataset(data_path,
                      view1_col=view1_col,
                      view2_col=view2_col,
                      label_col=label_col)
    # One label id is excluded from the class count
    # (presumably the "unknown" label -- confirm against Dataset).
    n_cluster = len(dataset.id_to_label) - 1
    print("loaded dataset, num of class = %d" % n_cluster)

    idxes = dataset.trn_idx_no_unk if mvsc_no_unk else dataset.trn_idx
    trn_idx = [x.item() for x in np.random.permutation(idxes)]
    if max_examples is not None:
        trn_idx = trn_idx[:max_examples]

    num_clusters = n_cluster if num_clusters is None else num_clusters
    print('clustering over num clusters', num_clusters)

    # Use the resolved cluster count; previously the caller-supplied
    # ``num_clusters`` was printed but ``n_cluster`` was what got clustered.
    mvsc = multiview.mvsc.MVSC(k=num_clusters)
    latent_z1s, golds = transform(dataset, trn_idx, mvc_encoder, view='v1')
    latent_z2s, _ = transform(dataset, trn_idx, mvc_encoder, view='v2')
    print('running mvsc', end='', flush=True)
    start = time.time()
    preds, eivalues, eivectors, sigmas = mvsc.fit_transform(
        [latent_z1s, latent_z2s], [False] * 2)
    print('...done')
    mvsc_time = time.time() - start
    print('time taken %.3f' % mvsc_time)

    # Gold-based metrics only consider rows whose gold label is > 0.
    labelled = [(g, p) for g, p in zip(golds, preds) if g > 0]
    lgolds = [g for g, _ in labelled]
    lpreds = [p for _, p in labelled]
    prec, rec, f1 = cluster_metrics.calc_prec_rec_f1(
        gnd_assignments=torch.LongTensor(lgolds).to(device),
        pred_assignments=torch.LongTensor(lpreds).to(device))
    acc = cluster_metrics.calc_ACC(
        torch.LongTensor(lpreds).to(device),
        torch.LongTensor(lgolds).to(device))
    # Intrinsic metrics are computed on the view-1 latents over every
    # clustered row, not just the labelled ones.
    silhouette = sklearn.metrics.silhouette_score(latent_z1s,
                                                  preds,
                                                  metric='euclidean')
    davies_bouldin = sklearn.metrics.davies_bouldin_score(latent_z1s, preds)
    print(
        f'{datetime.datetime.now()} pretrain: eval prec={prec:.4f} rec={rec:.4f} f1={f1:.4f} '
        f'acc={acc:.4f} sil={silhouette:.4f}, db={davies_bouldin:.4f}')
예제 #2
0
def calc_prec_rec_f1_acc(preds, golds):
    """Score predicted cluster ids against gold labels.

    Only positions whose gold label is greater than zero take part in the
    scoring (presumably id 0 marks unlabelled examples -- confirm against
    the dataset code).

    Args:
        preds: iterable of predicted cluster ids, aligned with ``golds``.
        golds: iterable of gold label ids.

    Returns:
        Tuple ``(prec, rec, f1, acc)`` as produced by
        ``cluster_metrics.calc_prec_rec_f1`` and ``cluster_metrics.calc_ACC``.
    """
    kept = [(gold, pred) for gold, pred in zip(golds, list(preds)) if gold > 0]
    gold_ids = [gold for gold, _ in kept]
    pred_ids = [pred for _, pred in kept]

    def as_tensor(values):
        # A fresh tensor per call, exactly as many as the metric calls consume.
        return torch.LongTensor(values).to(device)

    prec, rec, f1 = cluster_metrics.calc_prec_rec_f1(
        gnd_assignments=as_tensor(gold_ids),
        pred_assignments=as_tensor(pred_ids))
    acc = cluster_metrics.calc_ACC(as_tensor(pred_ids), as_tensor(gold_ids))
    return prec, rec, f1, acc
예제 #3
0
def _gold_metrics(golds, preds):
    """Return ``(prec, rec, f1, acc)`` of ``preds`` vs ``golds``, using only rows with gold > 0."""
    kept = [(g, p) for g, p in zip(golds, preds) if g > 0]
    lgolds = [g for g, _ in kept]
    lpreds = [p for _, p in kept]
    prec, rec, f1 = cluster_metrics.calc_prec_rec_f1(
        gnd_assignments=torch.LongTensor(lgolds).to(device),
        pred_assignments=torch.LongTensor(lpreds).to(device))
    acc = cluster_metrics.calc_ACC(
        torch.LongTensor(lpreds).to(device),
        torch.LongTensor(lgolds).to(device))
    return prec, rec, f1, acc


def main():
    """Command-line entry point: optional pretraining, then alternating-view training.

    Steps, in order:
      1. Parse arguments and load the dataset.
      2. Load a saved model (``--model-path``) or build one from embeddings.
      3. Optionally pretrain for ``--pre-epoch`` epochs, keeping the state
         with the best test accuracy, then apply the post-pretraining hook.
      4. K-means the view-1 encodings to get initial assignments.
      5. For ``--num-epochs`` epochs, alternately train each view on the
         other's assignments via ``run_one_side``; keep the epoch with the
         best view-1/view-2 agreement F1 on the test split.
      6. Report final metrics and optionally save model and assignments.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--data-path',
                        type=str,
                        default='./data/airlines_processed.csv')
    parser.add_argument('--glove-path',
                        type=str,
                        default='./data/glove.840B.300d.txt')
    parser.add_argument('--pre-model',
                        type=str,
                        choices=['ae', 'qt'],
                        default='qt')
    parser.add_argument('--pre-epoch', type=int, default=0)
    parser.add_argument('--pt-batch', type=int, default=100)
    parser.add_argument('--model-path',
                        type=str,
                        help='path of pretrained model to load')
    parser.add_argument('--way', type=int, default=5)
    parser.add_argument('--num-epochs', type=int, default=100)
    parser.add_argument('--seed', type=int, default=0)

    parser.add_argument('--save-model-path', type=str)

    parser.add_argument('--view1-col', type=str, default='view1')
    parser.add_argument('--view2-col', type=str, default='view2')
    parser.add_argument('--label-col', type=str, default='label')
    args = parser.parse_args()

    np.random.seed(args.seed)

    print('loading dataset')
    dataset = Dataset(args.data_path,
                      view1_col=args.view1_col,
                      view2_col=args.view2_col,
                      label_col=args.label_col)
    n_cluster = len(dataset.id_to_label) - 1
    print("num of class = %d" % n_cluster)

    if args.model_path is not None:
        id_to_token, token_to_id, vocab_size, word_emb_size, model = \
            multiview_encoders.load_model(args.model_path)
        print('loaded model')
    else:
        id_to_token, token_to_id, vocab_size, word_emb_size, model = \
            multiview_encoders.from_embeddings(
                args.glove_path, dataset.id_to_token, dataset.token_to_id)
        print('created randomly initialized model')
    print('vocab_size', vocab_size)

    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    expressions = (model, optimizer)

    pre_acc, pre_state, pre_state_epoch = 0., None, None
    pretrain_method = {
        'ae': pretrain.pretrain_ae,
        'qt': pretrain.pretrain_qt,
    }[args.pre_model]
    for epoch in range(1, args.pre_epoch + 1):
        model.train()
        perm_idx = np.random.permutation(dataset.trn_idx)
        trn_loss, _ = pretrain_method(dataset,
                                      perm_idx,
                                      expressions,
                                      train=True)
        model.eval()
        _, tst_acc = pretrain_method(dataset,
                                     dataset.tst_idx,
                                     expressions,
                                     train=False)
        # Always snapshot the first epoch so ``pre_state`` is never None when
        # it is loaded below, even if test accuracy never rises above 0.
        if pre_state is None or tst_acc > pre_acc:
            pre_state = copy.deepcopy(model.state_dict())
            pre_acc = tst_acc
            pre_state_epoch = epoch
        print('{} epoch {}, train_loss={:.4f} test_acc={:.4f}'.format(
            datetime.datetime.now(), epoch, trn_loss, tst_acc))

    if args.pre_epoch > 0:
        # load best state
        model.load_state_dict(pre_state)
        print(f'loaded best state from epoch {pre_state_epoch}')

        # deepcopy pretrained views into v1 and/or view2
        {
            'ae': pretrain.after_pretrain_ae,
            'qt': pretrain.after_pretrain_qt,
        }[args.pre_model](model)

        # reinitialise optimizer
        optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
        expressions = (model, optimizer)
        print('applied post-pretraining')

    kmeans = sklearn.cluster.KMeans(n_clusters=n_cluster,
                                    max_iter=300,
                                    verbose=0,
                                    random_state=0)
    z_v1, golds = transform(dataset, dataset.trn_idx, model, encoder='v1')
    preds_v1 = kmeans.fit_predict(z_v1)

    prec, rec, f1, acc = _gold_metrics(golds, preds_v1)
    print(
        f'{datetime.datetime.now()} pretrain: test prec={prec:.4f} rec={rec:.4f} '
        f'f1={f1:.4f} acc={acc:.4f}')

    shot, way, query = 5, args.way, 15

    preds_v2 = None
    best_epoch, best_model, best_dev_f1 = None, None, None
    # Bound up front so the restore step below is well-defined when
    # --num-epochs is 0 and the loop body never runs.
    best_preds_v1, best_preds_v2 = None, None
    for epoch in range(1, args.num_epochs + 1):
        trn_loss = 0.

        # Train view 2 against view 1's current assignments ...
        _loss, preds_v2, tst_preds_v2 = run_one_side(model=model,
                                                     optimizer=optimizer,
                                                     preds_left=preds_v1,
                                                     pt_batch=args.pt_batch,
                                                     way=way,
                                                     shot=shot,
                                                     query=query,
                                                     n_cluster=n_cluster,
                                                     dataset=dataset,
                                                     right_encoder_side='v2')
        trn_loss += _loss

        # ... then view 1 against view 2's fresh assignments.
        _loss, preds_v1, tst_preds_v1 = run_one_side(model=model,
                                                     optimizer=optimizer,
                                                     preds_left=preds_v2,
                                                     pt_batch=args.pt_batch,
                                                     way=way,
                                                     shot=shot,
                                                     query=query,
                                                     n_cluster=n_cluster,
                                                     dataset=dataset,
                                                     right_encoder_side='v1')
        trn_loss += _loss

        # Model selection uses agreement between the two views on the test
        # split; gold labels are not involved here.
        dev_f1 = cluster_metrics.calc_f1(
            gnd_assignments=torch.LongTensor(tst_preds_v1).to(device),
            pred_assignments=torch.LongTensor(tst_preds_v2).to(device))
        dev_acc = cluster_metrics.calc_ACC(
            torch.LongTensor(tst_preds_v2).to(device),
            torch.LongTensor(tst_preds_v1).to(device))

        print('dev view 1 vs view 2: f1={:.4f} acc={:.4f}'.format(
            dev_f1, dev_acc))

        if best_dev_f1 is None or dev_f1 > best_dev_f1:
            print('new best epoch', epoch)
            best_epoch = epoch
            best_dev_f1 = dev_f1
            best_model = copy.deepcopy(model.state_dict())
            best_preds_v1 = preds_v1.copy()
            best_preds_v2 = preds_v2.copy()

        prec, rec, f1, acc = _gold_metrics(golds, preds_v1)
        print(
            f'{datetime.datetime.now()} epoch {epoch}, test prec={prec:.4f} rec={rec:.4f} '
            f'f1={f1:.4f} acc={acc:.4f}')

    if best_model is not None:
        print('restoring model for best dev epoch', best_epoch)
        model.load_state_dict(best_model)
        preds_v1, preds_v2 = best_preds_v1, best_preds_v2
    # else: no training epochs ran; keep the current model and the k-means
    # assignments computed above (preds_v2 stays None).

    prec, rec, f1, acc = _gold_metrics(golds, preds_v1)
    print(
        f'{datetime.datetime.now()} test prec={prec:.4f} rec={rec:.4f} f1={f1:.4f} acc={acc:.4f}'
    )

    if args.save_model_path is not None:
        preds_v1 = torch.from_numpy(preds_v1)
        if preds_v2 is not None:
            preds_v2 = torch.from_numpy(preds_v2)
        state = {
            'model_state': model.state_dict(),
            'id_to_token': dataset.id_to_token,
            'word_emb_size': word_emb_size,
            'v1_assignments': preds_v1,
            'v2_assignments': preds_v2
        }
        with open(expand(args.save_model_path), 'wb') as f:
            torch.save(state, f)
        print('saved model to ', args.save_model_path)
예제 #4
0
def run(data_path, model, pca_dims, view1_col, view2_col, label_col, no_idf,
        mvsc_no_unk):
    """Cluster tf-idf + truncated-SVD features and score against gold labels.

    Args:
        data_path, view1_col, view2_col, label_col: forwarded to ``Dataset``.
        model: one of ``'view1pca'``, ``'view2pca'``, ``'wholeconvpca'``
            (k-means over a single reduced feature matrix) or ``'mvsc'``
            (multi-view spectral clustering over both views).
        pca_dims: ``n_components`` for ``TruncatedSVD``.
        no_idf: when true, the vectorizer is built with ``use_idf=False``.
        mvsc_no_unk: for ``'mvsc'`` only, cluster ``dataset.trn_idx_no_unk``
            instead of ``dataset.trn_idx``.

    Returns:
        ``(prec, rec, f1, acc)``, or ``None`` if ``model == 'mvsc'`` and the
        ``multiview`` package cannot be imported.

    Raises:
        ValueError: if ``model`` is not one of the supported names.
    """
    pca_models = ('view1pca', 'view2pca', 'wholeconvpca')
    supported = pca_models + ('mvsc',)
    if model not in supported:
        # Previously an unknown name fell through both branches and failed
        # later with a NameError on ``preds``.
        raise ValueError(f'unknown model {model!r}; expected one of {supported}')

    print('loading dataset')
    dataset = Dataset(data_path,
                      view1_col=view1_col,
                      view2_col=view2_col,
                      label_col=label_col)
    n_cluster = len(dataset.id_to_label) - 1
    print("num of class = %d" % n_cluster)

    vocab_size = len(dataset.token_to_id)
    print('vocab_size', vocab_size)

    if model == 'mvsc':
        try:
            import multiview
        except ImportError:
            print('please install https://github.com/mariceli3/multiview')
            return
        print('imported multiview ok')

    def run_pca(features):
        """Tf-idf vectorize space-joined token-id strings, then reduce with TruncatedSVD."""
        print('fitting tfidf vectorizer', flush=True, end='')
        # Tokens are integer ids rendered as text, hence the digit-only pattern.
        vectorizer = TfidfVectorizer(token_pattern='\\d+',
                                     ngram_range=(1, 1),
                                     analyzer='word',
                                     min_df=0.0,
                                     max_df=1.0,
                                     use_idf=not no_idf)
        X = vectorizer.fit_transform(features)
        print(' ... done')
        print('X.shape', X.shape)

        print('running pca', flush=True, end='')
        pca = TruncatedSVD(n_components=pca_dims)
        X2 = pca.fit_transform(X)
        print(' ... done')
        return X2

    def flatten(conv):
        """Concatenate the tokens of every utterance in a conversation."""
        return [tok for utt in conv for tok in utt]

    def as_text(token_ids):
        """Render a token-id sequence as a space-separated string."""
        return ' '.join([str(tok) for tok in token_ids])

    # Gold labels must come from the same indices, in the same order, as the
    # rows that get clustered; otherwise zip() below pairs unrelated rows.
    if model == 'mvsc' and mvsc_no_unk:
        idxes = dataset.trn_idx_no_unk
    else:
        idxes = dataset.trn_idx
    examples = [dataset[idx] for idx in idxes]
    golds = [ex[1] for ex in examples]

    if model in pca_models:
        if model == 'view1pca':
            token_seqs = [ex[0][0] for ex in examples]
        elif model == 'view2pca':
            token_seqs = [flatten(ex[0][1]) for ex in examples]
        else:  # 'wholeconvpca': view-1 tokens followed by all view-2 tokens
            token_seqs = [ex[0][0] + flatten(ex[0][1]) for ex in examples]

        X2 = run_pca([as_text(seq) for seq in token_seqs])

        print('running kmeans', flush=True, end='')
        kmeans = sklearn.cluster.KMeans(n_clusters=n_cluster,
                                        max_iter=300,
                                        verbose=0,
                                        random_state=0)
        preds = kmeans.fit_predict(X2)
        print(' ... done')
    else:  # 'mvsc'
        mvsc = multiview.mvsc.MVSC(k=n_cluster)
        v1 = [as_text(ex[0][0]) for ex in examples]
        v2 = [as_text(flatten(ex[0][1])) for ex in examples]
        v1_pca = run_pca(v1)
        v2_pca = run_pca(v2)
        print('running mvsc', end='', flush=True)
        start = time.time()
        preds, eivalues, eivectors, sigmas = mvsc.fit_transform(
            [v1_pca, v2_pca], [False] * 2)
        print('...done')
        mvsc_time = time.time() - start
        print('time taken %.3f' % mvsc_time)

    # Only rows with gold label > 0 are scored.
    labelled = [(g, p) for g, p in zip(golds, preds) if g > 0]
    lgolds = [g for g, _ in labelled]
    lpreds = [p for _, p in labelled]
    prec, rec, f1 = cluster_metrics.calc_prec_rec_f1(
        gnd_assignments=torch.LongTensor(lgolds).to(device),
        pred_assignments=torch.LongTensor(lpreds).to(device))
    acc = cluster_metrics.calc_ACC(
        torch.LongTensor(lpreds).to(device),
        torch.LongTensor(lgolds).to(device))

    print(
        f'{datetime.datetime.now()} eval f1={f1:.4f} prec={prec:.4f} rec={rec:.4f} acc={acc:.4f}'
    )

    return prec, rec, f1, acc
예제 #5
0
def _eval_clustering(latents, preds, golds):
    """Return ``(prec, rec, f1, acc, silhouette, davies_bouldin)`` for one clustering.

    Gold-based metrics use only rows whose gold label is > 0; the two
    intrinsic scores are computed from ``latents`` and ``preds`` over all rows.
    """
    kept = [(g, p) for g, p in zip(golds, preds) if g > 0]
    lgolds = [g for g, _ in kept]
    lpreds = [p for _, p in kept]
    prec, rec, f1 = cluster_metrics.calc_prec_rec_f1(
        gnd_assignments=torch.LongTensor(lgolds).to(device),
        pred_assignments=torch.LongTensor(lpreds).to(device))
    acc = cluster_metrics.calc_ACC(
        torch.LongTensor(lpreds).to(device),
        torch.LongTensor(lgolds).to(device))
    silhouette = sklearn.metrics.silhouette_score(
        latents, preds, metric='euclidean')
    davies_bouldin = sklearn.metrics.davies_bouldin_score(latents, preds)
    return prec, rec, f1, acc, silhouette, davies_bouldin


def main():
    """Command-line entry point: optional pretraining, then alternating-view training.

    Steps, in order:
      1. Parse arguments and load the dataset.
      2. Load a saved model (``--model-path``) or build one from embeddings.
      3. Optionally pretrain for ``--pre-epoch`` epochs, checkpointing both
         the current and the best-so-far state after every epoch when
         ``--save-model-path`` is given, then apply the post-pretraining hook.
      4. K-means the view-1 encodings to get initial assignments.
      5. For ``--num-epochs`` epochs, train view 2 on view 1's assignments,
         re-cluster view 2, train view 1 on view 2's assignments and
         re-cluster view 1, printing metrics each epoch.
      6. Optionally save the model, both assignment vectors and the example
         orders they are aligned with.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--data-path', type=str, default='./data/airlines_processed.csv')
    parser.add_argument('--glove-path', type=str, default='./data/glove.840B.300d.txt')
    parser.add_argument('--pre-model', type=str, choices=['ae', 'qt'], default='qt')
    parser.add_argument('--pre-epoch', type=int, default=0)
    parser.add_argument('--pt-batch', type=int, default=100)
    parser.add_argument('--model-path', type=str, help='path of pretrained model to load')
    parser.add_argument('--way', type=int, default=5)
    parser.add_argument('--num-epochs', type=int, default=100)
    parser.add_argument('--seed', type=int, default=0)

    parser.add_argument('--save-model-path', type=str)

    parser.add_argument('--view1-col', type=str, default='view1')
    parser.add_argument('--view2-col', type=str, default='view2')
    parser.add_argument('--label-col', type=str, default='tag')
    args = parser.parse_args()

    np.random.seed(args.seed)

    print('loading dataset')
    dataset = Dataset(args.data_path,
                      view1_col=args.view1_col,
                      view2_col=args.view2_col,
                      label_col=args.label_col)
    n_cluster = len(dataset.id_to_label) - 1
    print("num of class = %d" % n_cluster)

    if args.model_path is not None:
        id_to_token, token_to_id, vocab_size, word_emb_size, model = \
            multiview_encoders.load_model(args.model_path)
        print('loaded model')
    else:
        id_to_token, token_to_id, vocab_size, word_emb_size, model = \
            multiview_encoders.create_model_from_embeddings(
                args.glove_path, dataset.id_to_token, dataset.token_to_id)
        print('created randomly initialized model')
    print('vocab_size', vocab_size)

    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    expressions = (model, optimizer)

    pre_acc, pre_state = 0., None
    pretrain_method = {
        'ae': pretrain.pretrain_ae,
        'qt': pretrain.pretrain_qt,
    }[args.pre_model]
    for epoch in range(1, args.pre_epoch + 1):
        model.train()
        perm_idx = np.random.permutation(dataset.trn_idx)
        trn_loss, _ = pretrain_method(dataset, perm_idx, expressions, train=True)
        model.eval()
        _, tst_acc = pretrain_method(dataset, dataset.tst_idx, expressions, train=False)
        # Always snapshot the first epoch so ``pre_state`` is never None when
        # it is saved or loaded below, even if test accuracy stays at 0.
        if pre_state is None or tst_acc > pre_acc:
            pre_state = copy.deepcopy(model.state_dict())
            pre_acc = tst_acc
        print('{} epoch {}, train_loss={:.4f} test_acc={:.4f}'.format(
            datetime.datetime.now(), epoch, trn_loss, tst_acc))
        if args.save_model_path is not None:
            # Checkpoint of the model as it stands after this epoch.
            save_model_path = f'{args.save_model_path}_pre_e{epoch}.dat'
            state = {
                'model_state': model.state_dict(),
                'id_to_token': dataset.id_to_token,
                'word_emb_size': word_emb_size
            }
            with open(expand(save_model_path), 'wb') as f:
                torch.save(state, f)
            print('saved model to ', save_model_path)

            # Checkpoint of the best state seen up to and including this epoch.
            save_model_path = f'{args.save_model_path}_pre_best_e{epoch}.dat'
            state = {
                'model_state': pre_state,
                'id_to_token': dataset.id_to_token,
                'word_emb_size': word_emb_size
            }
            with open(expand(save_model_path), 'wb') as f:
                torch.save(state, f)
            print('saved model to ', save_model_path)

    if args.pre_epoch > 0:
        # load best state
        model.load_state_dict(pre_state)
        print('loaded best state')

        # deepcopy pretrained views into v1 and/or view2
        {
            'ae': pretrain.after_pretrain_ae,
            'qt': pretrain.after_pretrain_qt,
        }[args.pre_model](model)

        # reinitialise optimizer
        optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
        expressions = (model, optimizer)
        print('applied post-pretraining')

    kmeans = sklearn.cluster.KMeans(n_clusters=n_cluster, max_iter=300, verbose=0, random_state=0)
    latent_z1s, golds = transform(dataset, dataset.trn_idx, model, encoder='v1')
    pred1s = kmeans.fit_predict(latent_z1s)

    prec, rec, f1, acc, silhouette, davies_bouldin = _eval_clustering(
        latent_z1s, pred1s, golds)
    print('{} pretrain: eval prec={:.4f} rec={:.4f} f1={:.4f} acc={:.4f} sil={:.4f}, db={:.4f}'.format(
        datetime.datetime.now(), prec, rec, f1, acc, silhouette, davies_bouldin))

    # ``perm_idx`` is the example order that ``pred1s`` is currently aligned with.
    perm_idx = dataset.trn_idx
    # Bind every name the save block reads, so --num-epochs 0 does not fail
    # (the original initialised a misspelled ``preds2_perm_idx`` here).
    pred2s, centroids1, centroids2 = None, None, None
    pred1s_perm_idx, pred2s_perm_idx = None, None
    shot, way, query = 5, args.way, 15
    for epoch in range(1, args.num_epochs + 1):
        trn_loss = 0.

        # --- train view 2 on view 1's assignments -------------------------
        sampler1 = CategoriesSampler(pred1s, args.pt_batch, way, shot + query)
        train1_batches = [[dataset[perm_idx[idx]] for idx in indices] for indices in sampler1]
        trn_loss += do_pass(train1_batches, shot, way, query, expressions, encoder='v2')

        latent_z2s, _ = transform(dataset, perm_idx, model, encoder='v2')
        # Seed k-means with the view-2 centroids of view 1's clusters.
        centroids2 = calc_centroids(latent_z2s, pred1s, n_cluster)
        kmeans2 = sklearn.cluster.KMeans(n_clusters=n_cluster, init=centroids2, max_iter=10, verbose=0)
        pred2s = kmeans2.fit_predict(latent_z2s)
        pred2s_perm_idx = perm_idx.copy()
        tst_latent_z2s, _ = transform(dataset, dataset.tst_idx, model, encoder='v2')
        tst_pred2s = kmeans2.predict(tst_latent_z2s)

        # --- train view 1 on view 2's assignments -------------------------
        sampler2 = CategoriesSampler(pred2s, args.pt_batch, way, shot + query)
        train2_batches = [[dataset[perm_idx[idx]] for idx in indices] for indices in sampler2]
        trn_loss += do_pass(train2_batches, shot, way, query, expressions, encoder='v1')

        # Draw a fresh example order for the view-1 re-clustering. ``pred2s``
        # is row-aligned with the previous order, so the same shuffle is
        # applied to a copy of it before it is paired with the new latents;
        # pairing the old-order assignments with new-order latents would
        # compute centroids from unrelated rows.
        order = np.random.permutation(len(perm_idx))
        pred2s_aligned = pred2s[order]
        perm_idx = np.asarray(perm_idx)[order]
        latent_z1s, golds = transform(dataset, perm_idx, model, encoder='v1')
        centroids1 = calc_centroids(latent_z1s, pred2s_aligned, n_cluster)
        kmeans1 = sklearn.cluster.KMeans(n_clusters=n_cluster, init=centroids1, max_iter=10, verbose=0)
        pred1s = kmeans1.fit_predict(latent_z1s)
        pred1s_perm_idx = perm_idx.copy()
        tst_latent_z1s, _ = transform(dataset, dataset.tst_idx, model, encoder='v1')
        tst_pred1s = kmeans1.predict(tst_latent_z1s)

        # Agreement between the two views on the test split (no gold labels).
        f1 = cluster_metrics.calc_f1(
            gnd_assignments=torch.LongTensor(tst_pred1s).to(device),
            pred_assignments=torch.LongTensor(tst_pred2s).to(device))
        acc = cluster_metrics.calc_ACC(
            torch.LongTensor(tst_pred2s).to(device),
            torch.LongTensor(tst_pred1s).to(device))

        print('TEST f1={:.4f} acc={:.4f}'.format(f1, acc))

        prec, rec, f1, acc, silhouette, davies_bouldin = _eval_clustering(
            latent_z1s, pred1s, golds)
        print('{} epoch {}, eval prec={:.4f} rec={:.4f} f1={:.4f} acc={:.4f} sil={:.4f}, db={:.4f}'.format(
            datetime.datetime.now(), epoch, prec, rec, f1, acc, silhouette, davies_bouldin))

    if args.save_model_path is not None:
        pred1s = torch.from_numpy(pred1s)
        if pred2s is not None:
            pred2s = torch.from_numpy(pred2s)
        state = {
            'model_state': model.state_dict(),
            'id_to_token': dataset.id_to_token,
            'word_emb_size': word_emb_size,
            'v1_assignments': pred1s,
            'v2_assignments': pred2s,
            # Example orders the two assignment vectors are aligned with
            # (None if no training epoch ran).
            'pred1s_perm_idx': pred1s_perm_idx,
            'pred2s_perm_idx': pred2s_perm_idx
        }
        with open(expand(args.save_model_path), 'wb') as f:
            torch.save(state, f)
        print('saved model to ', args.save_model_path)