def write(filename, filename2, rows):
    options_file = 'biomed_elmo_options.json'
    weight_file = 'biomed_elmo_weights.hdf5'
    elmo = Elmo(options_file, weight_file, 1, dropout=0)
    elmo = elmo.to("cuda")

    df = pd.read_csv(filename, nrows=rows)
    x = df["TEXT"].values
    y = df[[
        'No Finding', 'Enlarged Cardiomediastinum', 'Cardiomegaly',
        'Airspace Opacity', 'Lung Lesion', 'Edema', 'Consolidation',
        'Pneumonia', 'Atelectasis', 'Pneumothorax', 'Pleural Effusion',
        'Pleural Other', 'Fracture', 'Support Devices'
    ]].values

    for i in range(rows):
        print(str(i) + " of " + str(rows))
        text = x[i]
        token_list = []
        for word in tokenizer(text):
            token_list.append(word)
        for n in range(MAXLEN - len(token_list)):
            token_list.append('PAD')

        token_list = np.array([token_list])
        character_ids = batch_to_ids(token_list)
        character_ids = character_ids.to("cuda")
        word_emb = elmo(character_ids)['elmo_representations'][0]
        character_ids.to("cpu")
        word_emb = word_emb.data.cpu().numpy()
        label = y[i]
        batch = np.array([word_emb, label])
        final_filename = "./Data/" + filename2 + "_instance_" + str(i)
        np.save(final_filename, batch)
Пример #2
0
class SeqPrototypicalNetwork(nn.Module):
    def __init__(self, config):
        super(SeqPrototypicalNetwork, self).__init__()
        self.base_path = config['base_path']
        self.early_stopping = config['early_stopping']
        self.lr = config.get('meta_lr', 1e-3)
        self.weight_decay = config.get('meta_weight_decay', 0.0)

        if 'seq' in config['learner_model']:
            self.learner = RNNSequenceModel(config['learner_params'])
        elif 'mlp' in config['learner_model']:
            self.learner = MLPModel(config['learner_params'])
        elif 'bert' in config['learner_model']:
            self.learner = BERTSequenceModel(config['learner_params'])

        self.num_outputs = config['learner_params']['num_outputs']
        self.vectors = config.get('vectors', 'glove')

        if self.vectors == 'elmo':
            self.elmo = Elmo(
                options_file=
                "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json",
                weight_file=
                "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5",
                num_output_representations=1,
                dropout=0,
                requires_grad=False)
        elif self.vectors == 'glove':
            self.glove = torchtext.vocab.GloVe(name='840B', dim=300)
        elif self.vectors == 'bert':
            self.bert_tokenizer = BertTokenizer.from_pretrained(
                'bert-base-cased')

        self.loss_fn = {}
        for task in config['learner_params']['num_outputs']:
            self.loss_fn[task] = nn.CrossEntropyLoss(ignore_index=-1)

        if config.get('trained_learner', False):
            self.learner.load_state_dict(
                torch.load(
                    os.path.join(self.base_path, 'saved_models',
                                 config['trained_learner'])))
            logger.info('Loaded trained learner model {}'.format(
                config['trained_learner']))

        self.device = torch.device(config.get('device', 'cpu'))
        self.to(self.device)

        if self.vectors == 'elmo':
            self.elmo.to(self.device)

        self.initialize_optimizer_scheduler()

    def initialize_optimizer_scheduler(self):
        learner_params = [
            p for p in self.learner.parameters() if p.requires_grad
        ]
        if isinstance(self.learner, BERTSequenceModel):
            self.optimizer = AdamW(learner_params,
                                   lr=self.lr,
                                   weight_decay=self.weight_decay)
            self.lr_scheduler = get_constant_schedule_with_warmup(
                self.optimizer, num_warmup_steps=100)
        else:
            self.optimizer = optim.Adam(learner_params,
                                        lr=self.lr,
                                        weight_decay=self.weight_decay)
            self.lr_scheduler = optim.lr_scheduler.StepLR(self.optimizer,
                                                          step_size=500,
                                                          gamma=0.5)

    def vectorize(self, batch_x, batch_len, batch_y):
        with torch.no_grad():
            if self.vectors == 'elmo':
                char_ids = batch_to_ids(batch_x)
                char_ids = char_ids.to(self.device)
                batch_x = self.elmo(char_ids)['elmo_representations'][0]
            elif self.vectors == 'glove':
                max_batch_len = max(batch_len)
                vec_batch_x = torch.ones((len(batch_x), max_batch_len, 300))
                for i, sent in enumerate(batch_x):
                    sent_emb = self.glove.get_vecs_by_tokens(
                        sent, lower_case_backup=True)
                    vec_batch_x[i, :len(sent_emb)] = sent_emb
                batch_x = vec_batch_x.to(self.device)
            elif self.vectors == 'bert':
                max_batch_len = max(batch_len) + 2
                input_ids = torch.zeros((len(batch_x), max_batch_len)).long()
                for i, sent in enumerate(batch_x):
                    sent_token_ids = self.bert_tokenizer.encode(
                        sent, add_special_tokens=True)
                    input_ids[i, :len(sent_token_ids)] = torch.tensor(
                        sent_token_ids)
                batch_x = input_ids.to(self.device)

        batch_len = torch.tensor(batch_len).to(self.device)
        batch_y = torch.tensor(batch_y).to(self.device)
        return batch_x, batch_len, batch_y

    def forward(self, episodes, updates=1, testing=False):
        query_losses, query_accuracies, query_precisions, query_recalls, query_f1s = [], [], [], [], []
        n_episodes = len(episodes)

        for episode_id, episode in enumerate(episodes):

            batch_x, batch_len, batch_y = next(iter(episode.support_loader))
            batch_x, batch_len, batch_y = self.vectorize(
                batch_x, batch_len, batch_y)

            self.train()
            support_repr, support_label = [], []

            batch_x_repr = self.learner(batch_x, batch_len)
            support_repr.append(batch_x_repr)
            support_label.append(batch_y)

            prototypes = self._build_prototypes(support_repr, support_label,
                                                episode.n_classes)

            # Run on query
            query_loss = 0.0
            all_predictions, all_labels = [], []
            for module in self.learner.modules():
                if isinstance(module, nn.Dropout):
                    module.eval()

            for n_batch, (batch_x, batch_len,
                          batch_y) in enumerate(episode.query_loader):
                batch_x, batch_len, batch_y = self.vectorize(
                    batch_x, batch_len, batch_y)
                batch_x_repr = self.learner(batch_x, batch_len)
                output = self._normalized_distances(prototypes, batch_x_repr)
                output = output.view(output.size()[0] * output.size()[1], -1)
                batch_y = batch_y.view(-1)
                loss = self.loss_fn[episode.base_task](output, batch_y)
                query_loss += loss.item()

                if not testing:
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()
                    self.lr_scheduler.step()

                relevant_indices = torch.nonzero(
                    batch_y != -1).view(-1).detach()
                all_predictions.extend(
                    make_prediction(output[relevant_indices]).cpu())
                all_labels.extend(batch_y[relevant_indices].cpu())

            query_loss /= n_batch + 1

            # Calculate metrics
            accuracy, precision, recall, f1_score = utils.calculate_metrics(
                all_predictions, all_labels, binary=False)

            logger.info(
                'Episode {}/{}, task {} [query set]: Loss = {:.5f}, accuracy = {:.5f}, precision = {:.5f}, '
                'recall = {:.5f}, F1 score = {:.5f}'.format(
                    episode_id + 1, n_episodes, episode.task_id, query_loss,
                    accuracy, precision, recall, f1_score))

            query_losses.append(query_loss)
            query_accuracies.append(accuracy)
            query_precisions.append(precision)
            query_recalls.append(recall)
            query_f1s.append(f1_score)

        return query_losses, query_accuracies, query_precisions, query_recalls, query_f1s

    def _build_prototypes(self, data_repr, data_label, num_outputs):
        n_dim = data_repr[0].shape[2]
        data_repr = torch.cat(tuple([x.view(-1, n_dim) for x in data_repr]),
                              dim=0)
        data_label = torch.cat(tuple([y.view(-1) for y in data_label]), dim=0)

        prototypes = torch.zeros((num_outputs, n_dim), device=self.device)

        for c in range(num_outputs):
            idx = torch.nonzero(data_label == c).view(-1)
            if idx.nelement() != 0:
                prototypes[c] = torch.mean(data_repr[idx], dim=0)

        return prototypes

    def _normalized_distances(self, prototypes, q):
        d = torch.stack(tuple(
            [q.sub(p).pow(2).sum(dim=-1) for p in prototypes]),
                        dim=-1)
        return d.neg()
Пример #3
0
class NearestNeighborClassifier():
    def __init__(self, config):
        self.vectors = config.get('vectors', 'elmo')
        self.device = torch.device(config.get('device', 'cpu'))

        if self.vectors == 'elmo':
            self.elmo = Elmo(
                options_file=
                "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json",
                weight_file=
                "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5",
                num_output_representations=1,
                dropout=0,
                requires_grad=False)
            self.elmo.to(self.device)
        elif self.vectors == 'glove':
            self.glove = torchtext.vocab.GloVe(name='840B', dim=300)
        elif self.vectors == 'bert':
            self.bert_tokenizer = BertTokenizer.from_pretrained(
                'bert-base-cased')
            self.bert = BertModel.from_pretrained('bert-base-cased')
            self.bert.to(self.device)

        logger.info('Nearest neighbor classifier instantiated')

    def vectorize(self, batch_x, batch_len, batch_y):
        with torch.no_grad():
            if self.vectors == 'elmo':
                char_ids = batch_to_ids(batch_x)
                char_ids = char_ids.to(self.device)
                batch_x = self.elmo(char_ids)['elmo_representations'][0]
            elif self.vectors == 'glove':
                max_batch_len = max(batch_len)
                vec_batch_x = torch.ones((len(batch_x), max_batch_len, 300))
                for i, sent in enumerate(batch_x):
                    sent_emb = self.glove.get_vecs_by_tokens(
                        sent, lower_case_backup=True)
                    vec_batch_x[i, :len(sent_emb)] = sent_emb
                batch_x = vec_batch_x.to(self.device)
            elif self.vectors == 'bert':
                max_batch_len = max(batch_len) + 2
                input_ids = torch.zeros((len(batch_x), max_batch_len)).long()
                for i, sent in enumerate(batch_x):
                    sent_token_ids = self.bert_tokenizer.encode(
                        sent, add_special_tokens=True)
                    input_ids[i, :len(sent_token_ids)] = torch.tensor(
                        sent_token_ids)
                batch_x = input_ids.to(self.device)
                attention_mask = (batch_x.detach() != 0).float()
                batch_x, _ = self.bert(batch_x, attention_mask=attention_mask)
                batch_x = batch_x[:, 1:-1, :]
        batch_len = torch.tensor(batch_len).to(self.device)
        batch_y = torch.tensor(batch_y).to(self.device)
        return batch_x, batch_len, batch_y

    def training(self, train_episodes, val_episodes):
        return 0

    def testing(self, test_episodes):
        episode_accuracies, episode_precisions, episode_recalls, episode_f1s = [], [], [], []
        for episode_id, episode in enumerate(test_episodes):
            batch_x, batch_len, batch_y = next(iter(episode.support_loader))
            support_repr, _, support_labels = self.vectorize(
                batch_x, batch_len, batch_y)
            support_repr = support_repr.reshape(
                support_repr.shape[0] * support_repr.shape[1], -1)
            support_labels = support_labels.view(-1)
            support_repr = support_repr[support_labels != -1].cpu().numpy()
            support_labels = support_labels[support_labels != -1].cpu().numpy()

            batch_x, batch_len, batch_y = next(iter(episode.query_loader))
            query_repr, _, true_labels = self.vectorize(
                batch_x, batch_len, batch_y)
            query_repr = query_repr.reshape(
                query_repr.shape[0] * query_repr.shape[1], -1)
            true_labels = true_labels.view(-1)
            query_repr = query_repr[true_labels != -1].cpu().numpy()
            true_labels = true_labels[true_labels != -1].cpu().numpy()

            dist = cdist(query_repr, support_repr, metric='cosine')
            nearest_neighbor = np.argmin(dist, axis=1)
            predictions = support_labels[nearest_neighbor]

            accuracy = metrics.accuracy_score(true_labels, predictions)
            precision = metrics.precision_score(true_labels,
                                                predictions,
                                                average='macro')
            recall = metrics.recall_score(true_labels,
                                          predictions,
                                          average='macro')
            f1_score = metrics.f1_score(true_labels,
                                        predictions,
                                        average='macro')
            logger.info(
                'Episode {}/{}, task {} [query set]: Accuracy = {:.5f}, precision = {:.5f}, '
                'recall = {:.5f}, F1 score = {:.5f}'.format(
                    episode_id + 1, len(test_episodes), episode.task_id,
                    accuracy, precision, recall, f1_score))

            episode_accuracies.append(accuracy)
            episode_precisions.append(precision)
            episode_recalls.append(recall)
            episode_f1s.append(f1_score)

        logger.info(
            'Avg meta-testing metrics: Accuracy = {:.5f}, precision = {:.5f}, recall = {:.5f}, '
            'F1 score = {:.5f}'.format(np.mean(episode_accuracies),
                                       np.mean(episode_precisions),
                                       np.mean(episode_recalls),
                                       np.mean(episode_f1s)))

        return np.mean(episode_f1s)