Example #1
    def find_pseudo_labels(
            self,
            labeled_file_path: str,
            unlabeled_file_path: str,
            temperature: int = 10,
            batch_size: int = None,
            **kwargs
    ):
        labeled_data = load_data_jsonl(labeled_file_path)

        unlabeled_data = load_data_jsonl(unlabeled_file_path)

        if not batch_size:
            # Default: process all unlabeled data in a single batch
            batch_size = len(unlabeled_data)
        unlabeled_data_chunks = chunks(unlabeled_data, batch_size)
        n_batches = len(range(0, len(unlabeled_data), batch_size))

        all_recovered = list()
        labeled_embeddings = np.array(self.embedder.embed_sentences([d['sentence'] for d in labeled_data]))

        for batch_ix, unlabeled_data_chunk in enumerate(unlabeled_data_chunks):
            logger.info(f'Finding pseudo labels for batch {batch_ix + 1}/{n_batches}')
            unlabeled_embeddings = np.array(self.embedder.embed_sentences([d['sentence'] for d in unlabeled_data_chunk]))
            embeddings = np.concatenate((labeled_embeddings, unlabeled_embeddings), axis=0)
            # Cosine-similarity affinity matrix over all labeled + unlabeled embeddings
            w = (1 - pairwise_distances(embeddings, embeddings, metric='cosine')).astype(np.float32)
            all_recovered += get_nKNN_pseudo_labels(w, labeled_data, unlabeled_data_chunk, temperature=temperature)

        return all_recovered
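
`load_data_jsonl` and `chunks` are repository helpers that are not shown on this page. A minimal sketch of a `chunks`-style generator consistent with how it is called above (the name matches the call site, but the implementation is an assumption, not the repository's code):

def chunks(items, chunk_size):
    # Yield successive chunk_size-sized slices; the last slice may be shorter.
    for start in range(0, len(items), chunk_size):
        yield items[start:start + chunk_size]

Under this reading, `n_batches = len(range(0, len(unlabeled_data), batch_size))` counts exactly the slice boundaries the generator iterates over.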
Example #2
    def find_pseudo_labels(self,
                           labeled_file_path: str,
                           unlabeled_file_path: str,
                           temperature: int = 10,
                           batch_size: int = None,
                           **kwargs):
        labeled_data = load_data_jsonl(labeled_file_path)

        unlabeled_data = load_data_jsonl(unlabeled_file_path)

        if not batch_size:
            batch_size = len(unlabeled_data)
        unlabeled_data_chunks = chunks(unlabeled_data, batch_size)
        n_batches = len(range(0, len(unlabeled_data), batch_size))

        all_recovered = list()
        labeled_embeddings = np.array(
            self.embedder.embed_sentences([d['input'] for d in labeled_data]))

        for batch_ix, unlabeled_data_chunk in enumerate(unlabeled_data_chunks):
            logger.info(
                f'Finding pseudo labels for batch {batch_ix + 1}/{n_batches}')
            unlabeled_embeddings = np.array(
                self.embedder.embed_sentences(
                    [d['input'] for d in unlabeled_data_chunk]))
            embeddings = np.concatenate(
                (labeled_embeddings, unlabeled_embeddings), axis=0)

            # Symmetric k-NN graph over all embeddings (cosine distance)
            nn = NearestNeighbors(n_neighbors=10, metric='cosine')
            nn.fit(embeddings)
            graph = nn.kneighbors_graph().toarray()
            w = (graph.T + graph > 0).astype(int)

            # Degree matrix D and its inverse square root D^{-1/2}
            d = np.diag(w.sum(0))
            d_half = fractional_matrix_power(d, -0.5)

            # Symmetric normalized Laplacian: L_sym = I - D^{-1/2} W D^{-1/2}
            l_sym = np.eye(len(w)) - d_half @ w @ d_half

            # Eigendecomposition, skipping the trivial first eigenpair and keeping at most 31 components
            eigs = eigh(l_sym, eigvals=(1, min(31, len(l_sym) - 1)))
            # Scale each eigenvector by 1/sqrt(eigenvalue)
            normed_eigs = eigs[1] / np.sqrt(eigs[0])

            # W_prime: smoothed affinity matrix rebuilt from the scaled eigenvectors
            w_prime = (normed_eigs @ normed_eigs.T).astype(np.float32)

            all_recovered += get_nKNN_pseudo_labels(w_prime,
                                                    labeled_data,
                                                    unlabeled_data_chunk,
                                                    temperature=temperature)

        return all_recovered
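
The spectral step above (k-NN graph, normalized Laplacian, eigendecomposition, rebuilt affinity W') can be reproduced standalone. A self-contained sketch on random data using plain NumPy/SciPy/scikit-learn, written with the newer `subset_by_index` keyword of `scipy.linalg.eigh` instead of the legacy `eigvals` argument; it assumes the symmetrized k-NN graph is connected (non-zero eigenvalues):

import numpy as np
from scipy.linalg import eigh, fractional_matrix_power
from sklearn.neighbors import NearestNeighbors

rng = np.random.default_rng(0)
embeddings = rng.normal(size=(40, 8))              # 40 points, 8-dim embeddings

nn = NearestNeighbors(n_neighbors=10, metric='cosine').fit(embeddings)
graph = nn.kneighbors_graph().toarray()            # directed k-NN adjacency
w = (graph.T + graph > 0).astype(int)              # symmetrized adjacency

d_half = fractional_matrix_power(np.diag(w.sum(0)), -0.5)
l_sym = np.eye(len(w)) - d_half @ w @ d_half       # normalized Laplacian

vals, vecs = eigh(l_sym, subset_by_index=(1, min(31, len(l_sym) - 1)))
normed = vecs / np.sqrt(vals)                      # scale eigenvectors by 1/sqrt(eigenvalue)
w_prime = (normed @ normed.T).astype(np.float32)
print(w_prime.shape)                               # (40, 40) smoothed affinity matrix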
Example #3
    def find_pseudo_labels(
            self,
            labeled_file_path: str,
            unlabeled_file_path: str,
            batch_size: int = None,
            **kwargs
    ):
        self.fit(labeled_file_path)
        unlabeled_data = load_data_jsonl(unlabeled_file_path)

        if not batch_size:
            batch_size = len(unlabeled_data)
        unlabeled_data_chunks = chunks(unlabeled_data, batch_size)
        n_batches = len(range(0, len(unlabeled_data), batch_size))

        recovered = list()

        for batch_ix, unlabeled_data_chunk in enumerate(unlabeled_data_chunks):
            logger.info(f'Finding pseudo labels for batch {batch_ix + 1}/{n_batches}')
            X = self.embedder.embed_sentences([str(d['sentence']) for d in unlabeled_data_chunk])
            predictions = self.logreg.predict_proba(X)
            pseudo_labels = predictions.argmax(1)        # most probable class index
            pseudo_labels_scores = predictions.max(1)    # its predicted probability

            for original_data, pseudo_label, pseudo_label_score in zip(
                    unlabeled_data_chunk, pseudo_labels, pseudo_labels_scores):
                recovered.append(dict(
                    data=original_data.copy(),
                    pseudo_label=self.labels_vocab(pseudo_label, rev=True),
                    pseudo_label_score=float(pseudo_label_score)
                ))
        return recovered
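
`Vocab` is used as a callable that maps a label string to an integer index and, with `rev=True`, an index back to its label. A minimal sketch of such a helper, inferred from the call sites in these examples (the real repository class may differ):

class Vocab:
    # Hypothetical label <-> index mapping matching how Vocab is called in these examples.
    def __init__(self, labels):
        self.labels = sorted(set(labels))
        self._index = {label: ix for ix, label in enumerate(self.labels)}

    def __call__(self, item, rev=False):
        # rev=False: label -> index; rev=True: index -> label
        return self.labels[item] if rev else self._index[item]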
Example #4
 def fit(self, path):
     train_data = load_data_jsonl(path)
     X = self.embedder.embed_sentences([str(d['sentence']) for d in train_data])
     self.logreg = LogisticRegression(C=100.0)
     self.labels_vocab = Vocab([d['label'] for d in train_data])
     y = [self.labels_vocab(d['label']) for d in train_data]
     self.logreg.fit(X, y)
Example #5
 def fit(self, path):
     train_data = load_data_jsonl(path)
     self.tfidf = TfidfVectorizer()
     X = self.tfidf.fit_transform([str(d['sentence']) for d in train_data])
     self.logreg = LogisticRegression(C=100.0)
     self.labels_vocab = Vocab([d['label'] for d in train_data])
     y = [self.labels_vocab(d['label']) for d in train_data]
     self.logreg.fit(X, y)
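
Examples #4 and #5 differ only in the featurizer: sentence embeddings from `self.embedder` versus a TF-IDF bag of words. A self-contained sketch of the TF-IDF variant on toy data, with `load_data_jsonl` and `Vocab` replaced by plain Python lists (everything here is illustrative, not the repository's code):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

train = [("book a table for two", "book_restaurant"),
         ("what's the weather tomorrow", "get_weather"),
         ("reserve a spot at the bistro", "book_restaurant"),
         ("will it rain in Paris", "get_weather")]

tfidf = TfidfVectorizer()
X = tfidf.fit_transform([sentence for sentence, _ in train])
labels = sorted({label for _, label in train})
y = [labels.index(label) for _, label in train]

clf = LogisticRegression(C=100.0).fit(X, y)
probs = clf.predict_proba(tfidf.transform(["book me a restaurant"]))
print(labels[probs.argmax()], float(probs.max()))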
Example #6
    def find_pseudo_labels(
            self,
            labeled_file_path: str,
            unlabeled_file_path: str,
            temperature: int = 10,
            batch_size: int = None,
            **kwargs
    ):
        labeled_data = load_data_jsonl(labeled_file_path)

        unlabeled_data = load_data_jsonl(unlabeled_file_path)
        if not batch_size:
            batch_size = len(unlabeled_data)
        unlabeled_data_chunks = chunks(unlabeled_data, batch_size)
        n_batches = len(range(0, len(unlabeled_data), batch_size))

        all_recovered = list()
        labeled_embeddings = np.array(self.embedder.embed_sentences([d['sentence'] for d in labeled_data], detached=True))

        for batch_ix, unlabeled_data_chunk in enumerate(unlabeled_data_chunks):
            logger.info(f'Finding pseudo labels for batch {batch_ix + 1}/{n_batches}')
            unlabeled_embeddings = np.array(self.embedder.embed_sentences([d['sentence'] for d in unlabeled_data_chunk], detached=True))
            labels = [d['label'] for d in labeled_data] + ['' for _ in unlabeled_data_chunk]
            labels_vocab = Vocab([d['label'] for d in labeled_data])
            embeddings = np.concatenate((labeled_embeddings, unlabeled_embeddings), axis=0)

            # Build similarity matrix
            w = (1 - pairwise_distances(embeddings, embeddings, metric='cosine')).astype(np.float32)

            # Extract, for each label, the columns of W for that label's samples; used to compute the pseudo-label score
            w_label = dict()
            for label in labels_vocab.labels:
                labelled_global_indices = [ix for ix, d in enumerate(labeled_data) if d['label'] == label]
                w_label[label] = w[:, labelled_global_indices]

            # Build hierarchical tree, bottom to top
            Z = linkage(embeddings, 'ward')
            root_tree = to_tree(Z)

            # Split tree, top to bottom
            trees = get_unique_label_trees(root_tree=root_tree, labels=labels)

            # Recover data
            recovered = list()
            for tree, path in trees:
                output = list()

                # Get all indices in the tree
                order = tree.pre_order()
                tree_labels = [labels[ix] for ix in order]

                # Case when all elements of tree are unlabelled
                if set(tree_labels) == {''}:
                    recovered += output
                    continue

                # Case when samples are mixed (labeled & unlabeled), but with a unique label
                # Get the label
                pseudo_label = [l for o, l in zip(order, tree_labels) if len(labels[o])][0]

                # Iterate over items
                for ix in order:
                    # Case if item is unlabeled
                    if labels[ix] == '':
                        # Score: mean similarity to each label's labeled samples,
                        # passed through a temperature-scaled softmax
                        global_ix = ix
                        z_i = np.array([
                            w_label[label][global_ix].mean()
                            for label in labels_vocab.labels
                        ])
                        z_i *= temperature
                        z_i_bar = np.exp(z_i)
                        z_i_bar /= z_i_bar.sum()

                        pseudo_label_score = float(z_i_bar[labels_vocab(pseudo_label)])

                        # Output
                        dat = unlabeled_data_chunk[ix - len(labeled_data)].copy()
                        output.append(dict(
                            data=dat,
                            pseudo_label=pseudo_label,
                            pseudo_label_score=pseudo_label_score
                        ))
                recovered += output
            all_recovered += recovered
        return all_recovered
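
The tree-splitting helper `get_unique_label_trees` is not shown on this page. To make the SciPy hierarchy API concrete, here is a simplified stand-in that splits a dendrogram top-down until each subtree's leaves carry at most one distinct non-empty label; it illustrates `linkage`, `to_tree`, and `pre_order`, but it is not the repository's implementation (which also returns the path to each subtree):

import numpy as np
from scipy.cluster.hierarchy import linkage, to_tree

def split_by_unique_label(node, labels):
    # Yield subtrees whose leaves carry at most one distinct non-empty label.
    leaf_labels = {labels[ix] for ix in node.pre_order() if labels[ix]}
    if node.is_leaf() or len(leaf_labels) <= 1:
        yield node
    else:
        yield from split_by_unique_label(node.get_left(), labels)
        yield from split_by_unique_label(node.get_right(), labels)

rng = np.random.default_rng(0)
embeddings = rng.normal(size=(8, 4))
labels = ['a', 'a', '', 'b', '', 'b', '', '']   # '' marks unlabeled points
root = to_tree(linkage(embeddings, 'ward'))
for subtree in split_by_unique_label(root, labels):
    print(subtree.pre_order())                  # leaf indices of each recovered subtree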
Example #7
def run_proto(train_path,
              model_name_or_path,
              test_input_path=None,
              test_output_path=None,
              refined=False):
    import numpy as np
    from util.data import load_data_jsonl
    import random
    import collections
    import os
    import pickle

    if test_output_path:
        os.makedirs(os.path.dirname(test_output_path), exist_ok=True)
    if test_input_path:
        with open(test_input_path, 'r') as f:
            test_sentences = [line.strip() for line in f if len(line.strip())]
    else:
        test_sentences = list()
    # train_path = f'data/datasets/Liu/few-shot_final/01/train.jsonl'

    # Load model
    bert = BERTEncoder(model_name_or_path)
    net = Protonet(encoder=bert)
    optimizer = torch.optim.Adam(net.parameters(), lr=2e-5)

    # Load data
    data = load_data_jsonl(train_path)
    print("Data loaded")
    data_dict = collections.defaultdict(list)
    for d in data:
        data_dict[d['label']].append(d['sentence'])
    data_dict = dict(data_dict)
    for k, d in data_dict.items():
        random.shuffle(d)

    labels = sorted(data_dict.keys())
    random.shuffle(labels)
    labels_train = labels[:int(len(labels) / 2)]
    labels_valid = labels[int(len(labels) / 2):]

    print(f"Train Labels ({len(labels_train)}) {labels_train}")
    print(f"Valid Labels ({len(labels_valid)}) {labels_valid}")

    # train_data_dict = {
    #     k: d[:int(0.7 * len(d))] for k, d in data_dict.items()
    # }
    # valid_data_dict = {
    #     k: d[int(0.7 * len(d)):] for k, d in data_dict.items()
    # }
    train_data_dict = {label: data_dict[label] for label in labels_train}
    valid_data_dict = {label: data_dict[label] for label in labels_valid}

    print("Data split. starting training")

    accs = list()
    n_eval_since_last_best = 0
    best_valid_acc = 0.0

    for step in range(10000):
        loss, loss_dict = train_step(net,
                                     optimizer,
                                     train_data_dict,
                                     refined=refined)
        accs.append(loss_dict['acc'])
        if (step + 1) % 100 == 0:
            train_acc = np.mean(accs)
            valid_acc = test_step(net, valid_data_dict, refined=refined)
            if valid_acc > best_valid_acc:
                print(
                    f"Train acc={train_acc:.4f} | Valid acc={valid_acc:.4f} (better)"
                )
                n_eval_since_last_best = 0
                best_valid_acc = valid_acc

                if test_input_path:
                    embeddings = list()
                    for i in tqdm.tqdm(range(0, len(test_sentences), 16)):
                        net.eval()
                        with torch.no_grad():
                            embeddings.append(
                                net.encoder.forward(test_sentences[i:i + 16])
                                .cpu().detach().numpy())
                    with open(test_output_path, "wb") as file:
                        pickle.dump(embeddings, file)
            else:
                n_eval_since_last_best += 1
                print(
                    f"Train acc={train_acc:.4f} | Valid acc={valid_acc:.4f} (worse, {n_eval_since_last_best})"
                )
        if n_eval_since_last_best >= 5:
            print(f"Early-stopping.")
            break
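
`BERTEncoder`, `Protonet`, `train_step`, and `test_step` come from the surrounding repository and are not shown here. As a reminder of the standard prototypical-networks scoring step such a `Protonet` presumably implements (this is the textbook formulation, not the repository's code): class prototypes are support-set means, and queries are scored by negative squared distance to each prototype.

import torch

def proto_logits(support, support_labels, query, n_classes):
    # Mean support embedding per class, then negative squared euclidean
    # distance from each query embedding to each prototype.
    prototypes = torch.stack([
        support[support_labels == c].mean(dim=0) for c in range(n_classes)
    ])                                              # (n_classes, dim)
    return -torch.cdist(query, prototypes) ** 2     # (n_query, n_classes)

# Toy episode: 3 classes, 5 support and 2 query points each, 16-dim embeddings
support = torch.randn(15, 16)
support_labels = torch.arange(3).repeat_interleave(5)
query = torch.randn(6, 16)
print(proto_logits(support, support_labels, query, n_classes=3).shape)  # (6, 3)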