Пример #1
0
    def test(self, word_ids, sentiment):
        """
        Evaluate the sentence-sentiment network on a held-out set.

        :param word_ids: array of word-id sequences; cast to int64 for embedding lookup
        :param sentiment: per-sentence sentiment targets (cast to float32)
        :return: (mean loss per minibatch, mean mismatch rate per minibatch)
        """
        self._net.train(False)  # evaluation mode (legacy pre-0.4 PyTorch idiom)
        word_ids = word_ids.astype(np.int64)

        word_ids_tensor = gpu(torch.from_numpy(word_ids), self._use_cuda)
        sent_tensor = gpu(
            torch.from_numpy(np.asarray(sentiment).astype(np.float32)),
            self._use_cuda)
        epoch_loss = 0.0
        epoch_acc = 0.0
        # NOTE(review): if the input yields zero minibatches, minibatch_num is
        # never bound and the divisions below raise NameError — confirm callers
        # never pass an empty set.
        for (minibatch_num, (batch_word, batch_sent)) in enumerate(
                minibatch_sentences(self._batch_size, word_ids_tensor,
                                    sent_tensor)):
            word_var = Variable(batch_word)
            sent_var = Variable(batch_sent.squeeze(), requires_grad=False)
            predictions = self._net(word_var)
            preds = accuracy_one(predictions.data)
            loss = self._loss(predictions, sent_var)
            # loss.data[0] is the legacy (pre-0.4) scalar accessor; newer
            # PyTorch uses loss.item()
            epoch_loss = epoch_loss + loss.data[0]
            # NOTE(review): this sums *mismatches* (preds != targets), so
            # "epoch_acc" accumulates an error rate rather than accuracy —
            # confirm against accuracy_one's semantics.
            epoch_acc += torch.sum(preds != sent_var.data.byte()) / len(
                sent_var.data)
        epoch_loss = epoch_loss / (minibatch_num + 1)
        epoch_acc = epoch_acc / (minibatch_num + 1)

        return epoch_loss, epoch_acc
Пример #2
0
    def _select_user_user_sppmi_input(self, batch_user, done, user_user_sppmi):
        """
        Select SPPMI rows for users in the batch that were not processed yet.

        Parameters
        ----------
        batch_user: :class:`torch.Tensor` shape (batch_size, )
        done: :class:`set`
            ids of users whose SPPMI rows were already selected; updated in place
        user_user_sppmi: array-like, indexable by an array of user ids
            user-user SPPMI matrix (one row per user)

        Returns
        -------
        list
            [] when every user in the batch was already processed, otherwise
            [user_indices_tensor, sppmi_rows_tensor], moved to GPU when enabled
        """
        # keep only first-seen users, recording them in `done`
        targets = []
        for u in torch_utils.tensor2numpy(batch_user):
            if u in done:
                continue
            done.add(u)
            targets.append(u)
        if not targets:
            return []
        targets = np.array(targets)
        selected = user_user_sppmi[targets]

        # (fix: `targets` is already an ndarray — no need to re-wrap it)
        user_indices = torch_utils.numpy2tensor(targets, dtype=torch.long)
        selected = torch_utils.numpy2tensor(np.asarray(selected), dtype=torch.float)

        return [torch_utils.gpu(user_indices, gpu=self._use_cuda),
                torch_utils.gpu(selected, self._use_cuda)]
Пример #3
0
    def load_best_model_test2_test3(self, test2: interactions.MatchInteraction,
                                    test3: interactions.MatchInteraction,
                                    topN: int):
        """
        Restore the best saved weights and score them on the hard Test2 split.

        Note: despite the name, only ``test2`` is evaluated in this method;
        ``test3`` is accepted but not used here.
        """
        net = self._net
        net.load_state_dict(torch.load(self.saved_model))
        net.train(False)  # evaluation mode
        my_utils.gpu(net, self._use_cuda)

        assert len(test2.unique_queries_test) in KeyWordSettings.QueryCountTest

        results, ranking_analysis = self.evaluate(test2, topN,
                                                  output_ranking=True)
        hits_test2 = results["hits"]
        ndcg_test2 = results["ndcg"]
        ndcg_at_1_test2 = results["ndcg@1"]

        FileHandler.save_error_analysis_test2(
            json.dumps(ranking_analysis, sort_keys=True, indent=2))
        FileHandler.myprint(
            'Best Test2_hard hits@%d = %.5f | Best Test2_hard ndcg@%d = %.5f '
            '|Best Test2_hard ndcg@1 = %.5f ' %
            (topN, hits_test2, topN, ndcg_test2, ndcg_at_1_test2))

        return hits_test2, ndcg_test2
Пример #4
0
    def load_best_model_single(
            self, target_interactions: interactions.MatchInteraction,
            topN: int):
        """
        Restore the best saved weights and evaluate a single interaction set.

        Note: this function is used for heat-map visualization only.
        """
        net = self._net
        net.load_state_dict(torch.load(self.saved_model))
        net.train(False)  # evaluation mode
        my_utils.gpu(net, self._use_cuda)

        results, ranking_analysis = self.evaluate(target_interactions, topN,
                                                  output_ranking=True)
        hits = results["hits"]
        ndcg = results["ndcg"]
        ndcg_at_1 = results["ndcg@1"]

        FileHandler.save_error_analysis_validation(
            json.dumps(ranking_analysis, sort_keys=True, indent=2))
        FileHandler.myprint(
            'Best Target hits@%d = %.5f | Best Target ndcg@%d = %.5f '
            '|Best Target ndcg@1 = %.5f' % (topN, hits, topN, ndcg, ndcg_at_1))

        return hits, ndcg
Пример #5
0
    def _prepare_network_input(self, batch_user, visited_users, adjNetwork):
        """
        Build (user_indices, adjacency_labels) tensors for users in the batch
        that have not been visited yet.

        Parameters
        ----------
        batch_user: :class:`torch.Tensor` shape (batch_size, )
        visited_users: :class:`set`
            ids of users already processed for network information; updated in place
        adjNetwork: :class:`dict`
            key: userID `int` and values are adjacency vectors over all users

        Returns
        -------
        list
            [user_indices_tensor, labels_tensor], moved to GPU when enabled
        """
        targets, labels = [], []
        for u in torch_utils.tensor2numpy(batch_user):
            if u in visited_users:
                continue
            visited_users.add(u)
            targets.append(u)
            # users absent from the network dict get an all-zero adjacency row
            labels.append(adjNetwork.get(u, [0] * self._n_users))
        user_indices = torch_utils.numpy2tensor(np.array(targets), dtype=torch.long)
        labels = torch_utils.numpy2tensor(np.array(labels), dtype=torch.float)
        # BUG FIX: previously hardcoded gpu=True, which crashes on CPU-only
        # machines and is inconsistent with _select_user_user_sppmi_input,
        # which respects self._use_cuda.
        return [torch_utils.gpu(user_indices, gpu=self._use_cuda),
                torch_utils.gpu(labels, self._use_cuda)]
 def predict(self, user_ids, item_ids):
     """
     Score (user, item) pairs with the trained network.

     :param user_ids: array of user ids (cast to int64)
     :param item_ids: array of item ids (cast to int64)
     :return: numpy array of predicted scores
     """
     self._net.train(False)  # evaluation mode
     user_ids = user_ids.astype(np.int64)
     item_ids = item_ids.astype(np.int64)
     user_ids_tensor = gpu(torch.from_numpy(user_ids), self._use_cuda)
     item_ids_tensor = gpu(torch.from_numpy(item_ids), self._use_cuda)
     user_var = Variable(user_ids_tensor)
     item_var = Variable(item_ids_tensor)
     # BUG FIX: .cpu() before .numpy() — converting a CUDA tensor straight to
     # numpy raises; .cpu() is a no-op when the tensor is already on the CPU
     return self._net(user_var, item_var).data.cpu().numpy()
Пример #7
0
    def fit(self,
            word_ids,
            sentiment,
            word_ids_test,
            sentiment_test,
            verbose=True):
        """
        Train the sentence-sentiment model for ``self._n_iter`` epochs.

        :param word_ids: training word-id sequences (cast to int64)
        :param sentiment: training sentiment targets (cast to float32)
        :param word_ids_test: held-out word-id sequences
        :param sentiment_test: held-out sentiment targets
        :param verbose: when True, evaluate on the test split and print
            per-epoch statistics
        """

        word_ids = word_ids.astype(np.int64)
        word_ids_test = word_ids_test.astype(np.int64)

        if not self._initialized:
            self._initialize()

        self._net.train(True)  # training mode (dropout etc.)

        for epoch_num in range(self._n_iter):

            # reshuffle sentences and targets together each epoch
            words, sents = shuffle_sentences(
                word_ids,
                np.asarray(sentiment).astype(np.float32))
            word_ids_tensor = gpu(torch.from_numpy(words), self._use_cuda)
            sent_tensor = gpu(torch.from_numpy(sents), self._use_cuda)
            epoch_loss = 0.0
            epoch_acc = 0.0
            for (minibatch_num, (batch_word, batch_sent)) in enumerate(
                    minibatch_sentences(self._batch_size, word_ids_tensor,
                                        sent_tensor)):
                word_var = Variable(batch_word)
                sent_var = Variable(batch_sent.squeeze(), requires_grad=False)
                predictions = self._net(word_var)
                #print(predictions)
                preds = accuracy_one(predictions.data)

                self._optimizer.zero_grad()

                loss = self._loss(predictions, sent_var)

                # legacy pre-0.4 scalar accessor; newer PyTorch uses loss.item()
                epoch_loss = epoch_loss + loss.data[0]
                # NOTE(review): this accumulates *mismatches* (preds != targets),
                # so "epoch_acc" looks like an error rate, not accuracy —
                # confirm against accuracy_one's semantics.
                epoch_acc += torch.sum(preds != sent_var.data.byte()) / len(
                    sent_var.data)

                loss.backward()

                self._optimizer.step()

            # average over the number of minibatches seen this epoch
            epoch_loss = epoch_loss / (minibatch_num + 1)
            epoch_acc = epoch_acc / (minibatch_num + 1)

            if verbose:
                val_loss, val_acc = self.test(word_ids_test, sentiment_test)
                #val_loss = 0
                #val_acc = 0
                print('Epoch {}: train loss {}'.format(epoch_num, epoch_loss),
                      'train acc', epoch_acc, 'validation loss', val_loss,
                      'validation acc', val_acc)
                self._net.train(True)  # self.test() switched the net to eval mode
    def fit(self,
            user_ids,
            item_ids,
            ratings,
            user_ids_test,
            item_ids_test,
            ratings_test,
            verbose=True):
        """
        Train the rating-prediction model for ``self._n_iter`` epochs.

        :param user_ids: training user ids (cast to int64)
        :param item_ids: training item ids (cast to int64)
        :param ratings: training rating targets
        :param user_ids_test: held-out user ids
        :param item_ids_test: held-out item ids
        :param ratings_test: held-out rating targets
        :param verbose: when True, compute and print validation loss per epoch
        :raises ValueError: when an epoch ends with NaN or exactly-zero loss
            (a degenerate run)
        """

        user_ids = user_ids.astype(np.int64)
        item_ids = item_ids.astype(np.int64)
        user_ids_test = user_ids_test.astype(np.int64)
        item_ids_test = item_ids_test.astype(np.int64)

        if not self._initialized:
            self._initialize()

        for epoch_num in range(self._n_iter):
            # reshuffle the (user, item, rating) triples together each epoch
            users, items, ratingss = shuffle(user_ids, item_ids, ratings)

            user_ids_tensor = gpu(torch.from_numpy(users), self._use_cuda)
            item_ids_tensor = gpu(torch.from_numpy(items), self._use_cuda)
            ratings_tensor = gpu(torch.from_numpy(ratingss), self._use_cuda)
            epoch_loss = 0.0

            for (minibatch_num,
                 (batch_user, batch_item, batch_ratings)) in enumerate(
                     minibatch(self._batch_size, user_ids_tensor,
                               item_ids_tensor, ratings_tensor)):
                user_var = Variable(batch_user)
                item_var = Variable(batch_item)
                ratings_var = Variable(batch_ratings)

                predictions = self._net(user_var, item_var)

                self._optimizer.zero_grad()

                loss = self._loss_func(ratings_var, predictions)

                # legacy pre-0.4 scalar accessor; newer PyTorch uses loss.item()
                epoch_loss = epoch_loss + loss.data[0]

                loss.backward()
                self._optimizer.step()

            # average over the number of minibatches
            epoch_loss = epoch_loss / (minibatch_num + 1)

            if verbose:
                val_loss = self.test(user_ids_test, item_ids_test,
                                     ratings_test)
                print('Epoch {}: train loss {}'.format(epoch_num, epoch_loss),
                      'validation loss', val_loss)
                self._net.train(True)  # self.test() switched the net to eval mode

            if np.isnan(epoch_loss) or epoch_loss == 0.0:
                raise ValueError(
                    'Degenerate epoch loss: {}'.format(epoch_loss))
    def _get_loss(self, query_ids: torch.Tensor, query_contents: torch.Tensor,
                  doc_ids: torch.Tensor, doc_contents: torch.Tensor,
                  query_lens: np.ndarray, docs_lens: np.ndarray,
                  target_contents, **kargs) -> torch.Tensor:
        """
        Compute loss for batch_size pairs. Note: Query and Doc have different lengths

        :param query_ids: (B, )
        :param query_contents: (B, L)
        :param doc_ids: (B, )
        :param doc_contents: (B, R)
        :param query_lens: (B, )
        :param docs_lens: (B, )
        :param target_contents: (B, R) token-level targets
        :param kargs:
        :return: scalar cross-entropy loss; padding positions are ignored
        """
        batch_size = query_ids.size(0)  # B (not used below)
        L2 = doc_contents.size(1)  # R (not used below)
        L1 = query_contents.size(1)  # L (not used below)

        # sort by length (and keep the inverse permutation to undo the sort)
        # for packed-sequence processing inside the network
        q_new_indices, q_restoring_indices = torch_utils.get_sorted_index_and_reverse_index(
            query_lens)
        query_lens = my_utils.gpu(torch.from_numpy(query_lens), self._use_cuda)

        d_new_indices, d_old_indices = torch_utils.get_sorted_index_and_reverse_index(
            docs_lens)
        docs_lens = my_utils.gpu(torch.from_numpy(docs_lens), self._use_cuda)
        additional_paramters = {
            KeyWordSettings.Query_lens:
            query_lens,
            KeyWordSettings.QueryLensIndices:
            (q_new_indices, q_restoring_indices, query_lens),
            KeyWordSettings.Doc_lens:
            docs_lens,
            KeyWordSettings.DocLensIndices:
            (d_new_indices, d_old_indices, docs_lens),
            KeyWordSettings.UseCuda:
            self._use_cuda
        }
        logits = self._net(query_contents, doc_contents, query_lens, None,
                           **additional_paramters)
        # one class per vocabulary term
        num_classes = len(self._vocab._state['term_index'])
        logits = logits.view(-1, num_classes)  # (B * R, C)
        target_contents = target_contents.view(-1)  # (B, R) => (B * R)
        # pad tokens do not contribute to the loss
        loss = F.cross_entropy(logits,
                               target_contents,
                               ignore_index=self.index_of_pad_token)
        return loss
Пример #10
0
    def _initialize(self, interactions):
        """
        Set up the network, optimizer and loss from an interactions dataset.

        Parameters
        ----------
        interactions: :class:`interactions.Interactions`
        """
        self._n_users, self._n_items = interactions.num_users, interactions.num_items
        # NOTE(review): when _net_type is not "gau", _net keeps whatever value
        # it had before (possibly None) — confirm this fall-through is intended.
        if self._net_type == "gau":
            self._net = my_nets.GAU(self._n_users, self._n_items, self._embedding_dim)

        # move the model to the GPU when enabled
        self._net = my_utils.gpu(self._net, self._use_cuda)

        if self._optimizer_func is not None:
            self._optimizer = self._optimizer_func(self._net.parameters())
        else:
            self._optimizer = optim.Adam(self._net.parameters(),
                                         weight_decay=self._reg_l2,
                                         lr=self._learning_rate)

        # loss function is fixed for this trainer
        self._loss_func = my_losses.single_pointwise_square_loss
        print("Using: ", self._loss_func)
    def test(self, user_ids, item_ids, ratings):
        """
        Compute the loss of the network on held-out (user, item, rating) triples.

        :param user_ids: array of user ids (cast to int64)
        :param item_ids: array of item ids (cast to int64)
        :param ratings: array of rating targets
        :return: scalar loss value (legacy ``.data[0]`` accessor)
        """
        self._net.train(False)  # evaluation mode

        users = gpu(torch.from_numpy(user_ids.astype(np.int64)), self._use_cuda)
        items = gpu(torch.from_numpy(item_ids.astype(np.int64)), self._use_cuda)
        targets = gpu(torch.from_numpy(ratings), self._use_cuda)

        predictions = self._net(Variable(users), Variable(items))
        loss = self._loss_func(Variable(targets), predictions)
        return loss.data[0]
Пример #12
0
	def _initialize(self):
		"""Lazily build the model, the optimizer and the loss for training."""
		if self._net is None:
			model = FirstModel(self._embedding_dim, self._vocab_size, self._seq_len)
			self._net = gpu(model, self._use_cuda)

		self._optimizer = optim.Adam(self._net.parameters(),
			lr=self._learning_rate, weight_decay=0)

		if self._loss is None:
			self._loss = torch.nn.BCELoss()
Пример #13
0
 def _get_negative_prediction(self, user_ids):
     """Score randomly sampled negative items for each user (code from Spotlight)."""
     negatives = self._sampler.random_sample_items(
         self._n_items,
         len(user_ids),
         random_state=self._random_state)
     negatives_tensor = my_utils.gpu(torch.from_numpy(negatives), self._use_cuda)
     return self._net(user_ids, negatives_tensor)
    def evaluate(self,
                 testRatings: interactions.MatchInteraction,
                 output_ranking=False,
                 **kargs):
        """
        Compute token-level cross-entropy of the model on a test set.

        :param testRatings: interaction set to evaluate on
        :param output_ranking: accepted for interface compatibility; not used here
        :return: dict with key "cross_entropy" = total loss / number of non-pad tokens
        """
        self._net.train(False)  # disabling training
        query_ids, left_contents, left_lengths, \
        doc_ids, right_contents, target_contents, right_lengths = self._sampler.get_instances(testRatings)
        eval_loss = 0.0
        total_tokens = 0
        for (minibatch_num,
             (batch_query, batch_query_content, batch_query_len,
              batch_doc, batch_doc_content, batch_doc_target, batch_docs_lens)) \
                in enumerate(my_utils.minibatch(query_ids, left_contents, left_lengths,
                                                doc_ids, right_contents, target_contents, right_lengths,
                                                batch_size=self._batch_size)):

            batch_query = my_utils.gpu(torch.from_numpy(batch_query),
                                       self._use_cuda)
            batch_query_content = my_utils.gpu(
                torch.from_numpy(batch_query_content), self._use_cuda)
            # batch_query_len = my_utils.gpu(torch.from_numpy(batch_query_len), self._use_cuda)
            batch_doc = my_utils.gpu(torch.from_numpy(batch_doc),
                                     self._use_cuda)
            batch_doc_content = my_utils.gpu(
                torch.from_numpy(batch_doc_content), self._use_cuda)
            batch_doc_target = my_utils.gpu(torch.from_numpy(batch_doc_target),
                                            self._use_cuda)
            # batch_docs_lens = my_utils.gpu(torch.from_numpy(batch_docs_lens), self._use_cuda)
            batch_loss = self._get_loss(batch_query, batch_query_content,
                                        batch_doc, batch_doc_content,
                                        batch_query_len, batch_docs_lens,
                                        batch_doc_target)
            # weight each batch's loss by its number of real (non-pad) tokens
            # so the final average is per-token, not per-batch
            mask = (batch_doc_target != self.index_of_pad_token)
            non_pad_tokens = torch.sum(mask).float()
            loss = batch_loss.data.cpu().numpy()
            loss *= non_pad_tokens
            eval_loss += loss
            total_tokens += non_pad_tokens

        eval_loss /= total_tokens
        results = dict()
        results["cross_entropy"] = eval_loss
        return results
    def load_best_model(self, val_interactions: interactions.MatchInteraction,
                        test_interactions: interactions.MatchInteraction):
        """Restore the best checkpoint and report cross-entropy on val and test."""
        net = self._net
        net.load_state_dict(torch.load(self.saved_model))
        net.train(False)  # evaluation mode
        my_utils.gpu(net, self._use_cuda)

        val_results = self.evaluate(val_interactions)
        test_results = self.evaluate(test_interactions)
        val_loss = val_results["cross_entropy"]
        test_loss = test_results["cross_entropy"]

        FileHandler.save_error_analysis_validation(
            json.dumps(val_results, sort_keys=True, indent=2))
        FileHandler.save_error_analysis_testing(
            json.dumps(test_results, sort_keys=True, indent=2))
        FileHandler.myprint('Best val loss = %.5f |Best Test loss = %.5f ' %
                            (val_loss, test_loss))

        return val_loss, test_loss
	def _get_negative_prediction(self, user_ids):
		"""Draw random negative items for each user and score them with the net."""
		sampled = sample_items(
			self._num_items,
			len(user_ids),
			random_state=self._random_state)
		negative_var = Variable(gpu(torch.from_numpy(sampled), self._use_cuda))
		return self._net(user_ids, negative_var)
    def _initialize(self):
        """Lazily create the dot-product model, its optimizer and its loss."""
        if self._net is None:
            model = DotModel(self._num_users, self._num_items, self._embedding_dim)
            self._net = gpu(model, self._use_cuda)

        self._optimizer = optim.Adam(self._net.parameters(),
                                     lr=self._learning_rate,
                                     weight_decay=self._l2)

        if self._loss_func is None:
            self._loss_func = regression_loss
Пример #18
0
    def load_best_model(self, val_interactions: interactions.MatchInteraction,
                        test_interactions: interactions.MatchInteraction,
                        topN: int):
        """
        Restore the best saved checkpoint and report ranking metrics on the
        validation and test splits.

        :param val_interactions: validation interaction set
        :param test_interactions: test interaction set
        :param topN: cutoff for hits@N / ndcg@N
        :return: (val hits@N, val ndcg@N, test hits@N, test ndcg@N)
        """
        mymodel = self._net
        # print("Trained model: ", mymodel.out.weight)
        mymodel.load_state_dict(torch.load(self.saved_model))
        mymodel.train(False)  # evaluation mode
        my_utils.gpu(mymodel, self._use_cuda)

        # sanity check: split size must match one of the expected query counts
        assert len(val_interactions.unique_queries_test
                   ) in KeyWordSettings.QueryCountVal
        result_val, error_analysis_val = self.evaluate(val_interactions,
                                                       topN,
                                                       output_ranking=True)
        hits = result_val["hits"]
        ndcg = result_val["ndcg"]
        ndcg_at_1 = result_val["ndcg@1"]

        assert len(test_interactions.unique_queries_test
                   ) in KeyWordSettings.QueryCountTest
        result_test, error_analysis_test = self.evaluate(test_interactions,
                                                         topN,
                                                         output_ranking=True)
        hits_test = result_test["hits"]
        ndcg_test = result_test["ndcg"]
        ndcg_at_1_test = result_test["ndcg@1"]

        # persist per-query ranking analyses for offline inspection
        FileHandler.save_error_analysis_validation(
            json.dumps(error_analysis_val, sort_keys=True, indent=2))
        FileHandler.save_error_analysis_testing(
            json.dumps(error_analysis_test, sort_keys=True, indent=2))
        FileHandler.myprint(
            'Best Vad hits@%d = %.5f | Best Vad ndcg@%d = %.5f '
            '|Best Test hits@%d = %.5f |Best Test ndcg@%d = %.5f'
            '|Best Vad ndcg@1 = %.5f |Best Test ndcg@1 = %.5f' %
            (topN, hits, topN, ndcg, topN, hits_test, topN, ndcg_test,
             ndcg_at_1, ndcg_at_1_test))

        return hits, ndcg, hits_test, ndcg_test
    def decoder(self, decoder_inputs: torch.Tensor, init_states: torch.Tensor,
                encoder_outputs: torch.Tensor, encoder_decoder_veracities, **kargs):
        """
        Decoder with attention over pad_packed_sequence outputted from encoder. We need to loop step by step
        in the decoder. Usually, we can input the whole sequence into the decoder and let it run quickly.
        However, we are now using attention mechanism to derive context of each time-step in decoder.
        Therefore, we need to loop step-by-step based on input sequence of decoder.

        :param decoder_inputs: shape (batch_size, seq_length)
        :param init_states: the last hidden states outputted from the encoder
        :param encoder_outputs: pad_packed_sequence tensor with size (batch_size, seq_len, hidden_size)
        :param encoder_decoder_veracities: veracity signal forwarded to the attention cell
        :param kargs: must contain KeyWordSettings.UseCuda
        :return: (logits over the vocabulary, shape (batch_size, seq_length, vocab_size),
                  final decoder hidden state)
        """
        batch_size = decoder_inputs.size(0)
        max_target_length = decoder_inputs.size(1)  # seq_length of d-tweets

        all_decoder_outputs = Variable(torch.zeros(max_target_length, batch_size, self.decoder_att.output_size))
        use_cuda = kargs[KeyWordSettings.UseCuda]
        all_decoder_outputs = torch_utils.gpu(all_decoder_outputs, use_cuda)

        decoder_hidden = init_states
        # input feeding: the previous attentional state starts at zero
        input_feed_init = torch.zeros(batch_size, self.decoder_att.hidden_size)
        input_feed_init = torch_utils.gpu(input_feed_init, use_cuda)

        # teacher forcing: feed the ground-truth token at each time step.
        # (cleanup: removed unused tic/toc timing locals from the loop body)
        for t in range(max_target_length):
            decoder_input = decoder_inputs[:, t]  # Next input is current target
            # hidden state and input-feed vector are threaded through each step
            decoder_output, decoder_hidden, decoder_attn, input_feed_init = self.decoder_att(
                decoder_input, decoder_hidden, encoder_outputs, encoder_decoder_veracities, input_feed_init)
            all_decoder_outputs[t] = decoder_output  # shape (batch_size, self.decoder_att.output_size)

        all_decoder_outputs = all_decoder_outputs.permute(1, 0, 2)  # (B, max_target_length, decoder_output_size)
        logits = self.outputs2vocab(all_decoder_outputs)
        return logits, decoder_hidden
Пример #20
0
    def _initialize(self, interactions: interactions.MatchInteraction):
        """
        Move the network to the GPU (when enabled) and build the optimizer
        and the configured loss function.

        Parameters
        ----------
        interactions: :class:`interactions.MatchInteraction`
        """
        # put the model into cuda if use cuda
        self._net = my_utils.gpu(self._net, self._use_cuda)

        if self._optimizer_func is not None:
            self._optimizer = self._optimizer_func(self._net.parameters())
        else:
            self._optimizer = optim.Adam(self._net.parameters(),
                                         weight_decay=self._reg_l2,
                                         lr=self._learning_rate)

        # map the configured loss name to its implementation; unknown names
        # fall back to adaptive hinge loss, matching the original elif chain
        loss_by_name = {
            'pointwise': my_losses.pointwise_loss,
            'single_pointwise_square_loss': my_losses.single_pointwise_square_loss,
            'bpr': my_losses.bpr_loss,
            'hinge': my_losses.hinge_loss,
            'bce': my_losses.pointwise_bceloss,  # binary cross entropy
            'pce': my_losses.positive_cross_entropy,
            'cosine_max_margin_loss_dvsh': my_losses.cosine_max_margin_loss_dvsh,
            'cross_entropy': my_losses.binary_cross_entropy_cls,
            'masked_cross_entropy': my_losses.masked_binary_cross_entropy,
            'vanilla_cross_entropy': my_losses.vanilla_cross_entropy,
            'regression_loss': my_losses.regression_loss,
        }
        self._loss_func = loss_by_name.get(self._loss, my_losses.adaptive_hinge_loss)
        FileHandler.myprint("Using: " + str(self._loss_func))
Пример #21
0
	def __init__(self, embedding_dim = 30, n_iter = 2, batch_size = 64,
		learning_rate = 1e-3, 
		net = None, loss = None, use_cuda=False,
		vocab_size = 1, seq_len = 1):
		"""
		Hyper-parameter container for the sentence model trainer.

		:param embedding_dim: size of the word embeddings
		:param n_iter: number of training epochs
		:param batch_size: minibatch size
		:param learning_rate: Adam learning rate
		:param net: optional pre-built network; moved to GPU when use_cuda
		:param loss: optional loss function (initialized later when None)
		:param use_cuda: run on GPU when True
		:param vocab_size: vocabulary size used to build the default network
		:param seq_len: input sequence length used to build the default network
		"""
		self._embedding_dim = embedding_dim
		self._n_iter = n_iter
		self._batch_size = batch_size
		self._learning_rate = learning_rate
		self._use_cuda = use_cuda

		# FIX: `is not None` instead of `!= None` — identity check avoids
		# invoking a custom __eq__ on the passed-in network (PEP 8 idiom)
		if net is not None:
			self._net = gpu(net, use_cuda)
		else:
			self._net = None
		self._loss = loss
		self._optimizer = None
		self._vocab_size = vocab_size
		self._seq_len = seq_len
Пример #22
0
    def forward(self,
                query: torch.Tensor,
                document: torch.Tensor,
                verbose=False,
                **kargs):
        """Forward. of integer query tensor and document tensor """
        max_left_len, max_right_len = query.size(1), document.size(1)
        # Process left & right input.
        # https://github.com/AdeDZY/K-NRM/blob/master/knrm/model/model_base.py#L96
        tensor_mask = torch_utils.create_mask_tensor(query,
                                                     document,
                                                     threshold=1)
        doc_mask = (document > 0).float()
        query_mask = (query > 0).float()  # B, L
        embed_query = self.src_word_emb(query.long())  # (B, L, D)
        embed_doc = self.src_word_emb(document.long())  # (B, R, D)
        # normalizing vectors
        embed_query = F.normalize(embed_query, p=2, dim=-1)
        embed_doc = F.normalize(embed_doc, p=2, dim=-1)
        ################################# For Contextualized Representation using ELMO #############################
        query_ids = kargs[KeyWordSettings.QueryIDs]  # (B, )
        doc_ids = kargs[KeyWordSettings.DocIDs]  # (B, )
        assert query_ids.shape == doc_ids.shape
        use_cuda = kargs[KeyWordSettings.UseCuda]
        query_char_repr = self.left_elmo_tensor[query_ids]
        doc_char_repr = self.right_elmo_tensor[doc_ids]
        # I have to load to gpu at this step because left_tensor is too large to load to GPU
        query_char_repr = torch_utils.gpu(query_char_repr,
                                          use_cuda)  # (B, L, D1)
        doc_char_repr = torch_utils.gpu(doc_char_repr, use_cuda)  # (B, R, D1)
        assert query_char_repr.size(1) == embed_query.size(1)
        assert doc_char_repr.size(1) == embed_doc.size(1)
        ###############################################################################################
        q_convs, d_convs = [], []
        q_ctx_convs, d_ctx_convs = [], []
        for q_conv, d_conv, \
            q_context_conv, d_context_conv in zip(self.q_convs, self.d_convs, self.q_context_convs, self.d_context_convs):
            q_out = q_conv(embed_query).transpose(
                1, 2)  # to shape (B, D, L) => (B, F, L) => (B, L, F)
            d_out = d_conv(embed_doc).transpose(
                1, 2)  # to shape (B, D, R) => (B, F, R) => (B, R, F)
            q_out = F.normalize(q_out, p=2,
                                dim=-1)  # good stuff for relevance matching
            d_out = F.normalize(d_out, p=2, dim=-1)
            q_convs.append(q_out)
            d_convs.append(d_out)

            q_ctx_out = q_context_conv(query_char_repr).transpose(1,
                                                                  2)  # B, L, F
            d_ctx_out = d_context_conv(doc_char_repr).transpose(1,
                                                                2)  # B, R, F
            q_ctx_out = F.normalize(q_ctx_out, p=2, dim=-1)
            d_ctx_out = F.normalize(d_ctx_out, p=2, dim=-1)
            q_ctx_convs.append(q_ctx_out)
            d_ctx_convs.append(d_ctx_out)

        output_phis = []
        for idx in range(self.max_ngram):
            query_local_context = q_ctx_convs[idx]  # (B, L, D)
            doc_local_context = d_ctx_convs[idx]  # (B, R, D)
            sim_mat = self._get_sim_matrix(q_convs[idx], d_convs[idx])
            sim_mat = sim_mat * tensor_mask

            if self.attention_type == AttentionType.UsingDotProductOnly:
                # using sim_mat, context_mat, sim_mat - context_mat, sim_mat * context_mat
                # [S, L, S - L, S * L]
                context_aware_mat = self._get_sim_matrix(
                    query_local_context, doc_local_context) * tensor_mask
                tensors = torch.stack([
                    sim_mat, context_aware_mat, sim_mat - context_aware_mat,
                    sim_mat * context_aware_mat
                ],
                                      dim=-1)  # B, L, R, C
            elif self.attention_type == AttentionType.UsingDotProductDisim:
                # using sim_mat, context_mat, sim_mat - context_mat, dissimilarity * sim_mat
                # [S, L, S - L, S * D]
                context_aware_mat = self._get_sim_matrix(
                    query_local_context, doc_local_context) * tensor_mask
                dissimilarity = self._get_disimilarity_mat(
                    query_local_context, doc_local_context, tensor_mask,
                    self.use_average_dcompositional_att) * tensor_mask
                tensors = torch.stack([
                    sim_mat, context_aware_mat, sim_mat - context_aware_mat,
                    sim_mat * dissimilarity
                ],
                                      dim=-1)  # B, L, R, C
            elif self.attention_type == AttentionType.UsingBilinearOnly:
                # [S, B, S - B, S * B]
                bilinear = self._get_bilinear_attention(
                    query_local_context, doc_local_context) * tensor_mask
                tensors = torch.stack([
                    sim_mat, bilinear, sim_mat - bilinear, bilinear * sim_mat
                ],
                                      dim=-1)  # B, L, R, C
            elif self.attention_type == AttentionType.UsingBilinearDissim:
                # [S, B, S - B, S * D]
                bilinear = self._get_bilinear_attention(
                    query_local_context, doc_local_context) * tensor_mask
                dissimilarity = self._get_disimilarity_mat(
                    query_local_context, doc_local_context, tensor_mask,
                    self.use_average_dcompositional_att) * tensor_mask
                tensors = torch.stack([
                    sim_mat, bilinear, sim_mat - bilinear,
                    dissimilarity * sim_mat
                ],
                                      dim=-1)  # B, L, R, C

            tensors = tensors.permute(0, 3, 1, 2)  # (B, C, L, R)
            phi = torch.flatten(self.head_conv_layers[0](tensors), start_dim=1)
            output_phis.append(phi)

        phi = torch.cat(output_phis, dim=-1)  # (B, x)
        if self.use_visual:
            # a list of size B, where each element is a list of image tensors
            t1 = time.time()
            query_images_indices = kargs[KeyWordSettings.QueryImagesIndices]
            B1, n1, M1 = query_images_indices.shape  # expected shape
            assert n1 == 1
            query_images = self.full_left_images_tensor[
                query_images_indices.flatten().long()]  # B1 * n1 * M1, VD
            doc_imgs_indices = kargs[
                KeyWordSettings.
                DocImagesIndices]  # (B, n, M2, VD) or (B, M2, VD)
            B, n, M2 = doc_imgs_indices.shape  # expected shape
            images_mask = torch_utils.create_mask_tensor_image(
                query_images_indices, doc_imgs_indices)  # (B, n, M1, M2)
            doc_images = self.full_right_images_tensor[
                doc_imgs_indices.flatten().long()]  # B * n * M2, VD

            left_feats = self.image_fc1(
                query_images
            )  # (B * n1 * M1, H) we don't want visual_cnn on 30 duplicated queries images (not wise)
            right_feats = self.image_fc1(doc_images)  # (B * n * M2, H)
            left_feats = left_feats.view(B1, M1, self.last_visual_size)
            if B1 == 1:
                left_feats = left_feats.expand(
                    B, M1, self.last_visual_size)  # during testing
            right_feats = right_feats.view(B, n * M2, self.last_visual_size)
            right_feats = F.normalize(right_feats, p=2, dim=-1)
            left_feats = F.normalize(left_feats, p=2, dim=-1)
            scores = torch.bmm(left_feats,
                               right_feats.permute(0, 2, 1))  # (B, M1, n * M2)
            scores = scores.view(B, M1, n, M2).permute(0, 2, 1,
                                                       3)  # (B, n, M1, M2)
            # masking
            assert scores.size() == images_mask.size(), (scores.size(),
                                                         images_mask.size())
            scores = scores * images_mask
            scores = scores.view(B * n, M1, M2)
            visual_scores, _ = torch.flatten(scores, start_dim=1).max(-1)
            visual_scores = visual_scores.unsqueeze(-1)  # (B * n, 1)
            phi = torch.cat([phi, visual_scores], dim=-1)
            t2 = time.time()
            # print("Running time of CNN in forward: ", (t2 - t1), "seconds")
        out = self.linear(phi)
        if verbose:
            print("out: ", out.squeeze())
            # print("After dense and tanh: ", out)
        if KeyWordSettings.OutputRankingKey in kargs and kargs[
                KeyWordSettings.OutputRankingKey] and self.use_visual:
            return torch.cat([out, torch.flatten(scores, start_dim=1)],
                             dim=-1)  # for error analysis (B, 2)
        return out.squeeze()
Пример #23
0
    def fit(
            self,
            train_iteractions: interactions.MatchInteraction,
            verbose=True,  # for printing out evaluation during training
            topN=10,
            val_interactions: interactions.MatchInteraction = None,
            test_interactions: interactions.MatchInteraction = None):
        """
        Fit the model.

        Runs up to ``self._n_iter`` epochs. Each epoch re-samples negatives,
        shuffles, and trains over minibatches. When ``verbose`` is True the
        model is evaluated on ``val_interactions`` after every epoch (and on
        ``test_interactions`` every ``self._testing_epochs`` epochs); the best
        checkpoint by hits@topN (ties broken by ndcg@topN) is saved to
        ``self.saved_model``, with early stopping after
        ``self._early_stopping_patience`` epochs without improvement.

        Parameters
        ----------
        train_iteractions: :class:`matchzoo.DataPack` The input sequence dataset.
        verbose: evaluate and log after each epoch when True.
        topN: rank cut-off used for hits@N and ndcg@N.
        val_interactions: :class:`matchzoo.DataPack`
        test_interactions: :class:`matchzoo.DataPack`

        Raises
        ------
        ValueError
            If ``self._loss`` is not a supported loss type, or if an epoch
            ends with a degenerate (NaN or zero) loss.
        """
        self._initialize(train_iteractions)
        best_hit, best_ndcg, best_epoch, test_ndcg, test_hit = 0, 0, 0, 0, 0
        test_results_dict = None
        iteration_counter = 0
        count_patience_epochs = 0

        for epoch_num in range(self._n_iter):

            # Re-sample negatives and reshuffle at the start of every epoch.
            self._net.train(True)
            query_ids, left_contents, left_lengths, \
            doc_ids, right_contents, right_lengths, \
            neg_docs_ids, neg_docs_contents, neg_docs_lens = self._sampler.get_train_instances(train_iteractions, self._num_negative_samples)

            queries, query_content, query_lengths, \
            docs, doc_content, doc_lengths, \
            neg_docs, neg_docs_contents, neg_docs_lens = my_utils.shuffle(query_ids, left_contents, left_lengths,
                                                                doc_ids, right_contents, right_lengths,
                                                                neg_docs_ids, neg_docs_contents, neg_docs_lens)
            epoch_loss, total_pairs = 0.0, 0
            t1 = time.time()
            for (minibatch_num,
                (batch_query, batch_query_content, batch_query_len,
                 batch_doc, batch_doc_content, batch_docs_lens,
                 batch_neg_docs, batch_neg_doc_content, batch_neg_docs_lens)) \
                    in enumerate(my_utils.minibatch(queries, query_content, query_lengths,
                                                    docs, doc_content, doc_lengths,
                                                    neg_docs, neg_docs_contents, neg_docs_lens,
                                                    batch_size = self._batch_size)):
                # Per-token idf weights for the query (0.0 for unknown terms),
                # only when a global idf table has been loaded.
                query_idfs = None
                if len(TFIDF.get_term_idf()) != 0:
                    query_idf_dict = TFIDF.get_term_idf()
                    query_idfs = [[
                        query_idf_dict.get(int(word_idx), 0.0)
                        for word_idx in row
                    ] for row in batch_query_content]
                    query_idfs = torch_utils.gpu(
                        torch.from_numpy(np.array(query_idfs)).float(),
                        self._use_cuda)

                batch_query = my_utils.gpu(torch.from_numpy(batch_query),
                                           self._use_cuda)
                batch_query_content = my_utils.gpu(
                    torch.from_numpy(batch_query_content), self._use_cuda)
                batch_doc = my_utils.gpu(torch.from_numpy(batch_doc),
                                         self._use_cuda)
                batch_doc_content = my_utils.gpu(
                    torch.from_numpy(batch_doc_content), self._use_cuda)
                batch_neg_doc_content = my_utils.gpu(
                    torch.from_numpy(batch_neg_doc_content), self._use_cuda)
                total_pairs += self._batch_size * self._num_negative_samples

                self._optimizer.zero_grad()
                if self._loss in ["bpr", "hinge", "pce", "bce"]:
                    loss = self._get_multiple_negative_predictions_normal(
                        batch_query,
                        batch_query_content,
                        batch_doc,
                        batch_doc_content,
                        batch_neg_docs,
                        batch_neg_doc_content,
                        batch_query_len,
                        batch_docs_lens,
                        batch_neg_docs_lens,
                        self._num_negative_samples,
                        query_idf=query_idfs)
                else:
                    # Fail fast with a clear message: previously an unknown
                    # loss type left `loss` unbound and crashed just below
                    # with a confusing NameError.
                    raise ValueError(
                        'Unsupported loss type: {}'.format(self._loss))
                epoch_loss += loss.item()
                iteration_counter += 1
                # if iteration_counter % 2 == 0: break
                TensorboardWrapper.mywriter().add_scalar(
                    "loss/minibatch_loss", loss.item(), iteration_counter)
                loss.backward()
                self._optimizer.step()
            epoch_loss /= float(total_pairs)
            TensorboardWrapper.mywriter().add_scalar("loss/epoch_loss_avg",
                                                     epoch_loss, epoch_num)
            # print("Number of Minibatches: ", minibatch_num, "Avg. loss of epoch: ", epoch_loss)
            t2 = time.time()
            epoch_train_time = t2 - t1
            if verbose:  # validation after each epoch
                t1 = time.time()
                assert len(val_interactions.unique_queries_test
                           ) in KeyWordSettings.QueryCountVal, len(
                               val_interactions.unique_queries_test)
                result_val = self.evaluate(val_interactions, topN)
                hits = result_val["hits"]
                ndcg = result_val["ndcg"]
                t2 = time.time()
                validation_time = t2 - t1

                # Periodic test-set evaluation (skipped at epoch 0).
                if epoch_num and epoch_num % self._testing_epochs == 0:
                    t1 = time.time()
                    assert len(test_interactions.unique_queries_test
                               ) in KeyWordSettings.QueryCountTest
                    result_test = self.evaluate(test_interactions, topN)
                    hits_test = result_test["hits"]
                    ndcg_test = result_test["ndcg"]
                    t2 = time.time()
                    testing_time = t2 - t1
                    TensorboardWrapper.mywriter().add_scalar(
                        "hit/hit_test", hits_test, epoch_num)
                    TensorboardWrapper.mywriter().add_scalar(
                        "ndcg/ndcg_test", ndcg_test, epoch_num)
                    FileHandler.myprint(
                        '|Epoch %03d | Test hits@%d = %.5f | Test ndcg@%d = %.5f | Testing time: %04.1f(s)'
                        % (epoch_num, topN, hits_test, topN, ndcg_test,
                           testing_time))

                TensorboardWrapper.mywriter().add_scalar(
                    "hit/hits_val", hits, epoch_num)
                TensorboardWrapper.mywriter().add_scalar(
                    "ndcg/ndcg_val", ndcg, epoch_num)
                FileHandler.myprint(
                    '|Epoch %03d | Train time: %04.1f(s) | Train loss: %.3f'
                    '| Vad hits@%d = %.5f | Vad ndcg@%d = %.5f | Validation time: %04.1f(s)'
                    % (epoch_num, epoch_train_time, epoch_loss, topN, hits,
                       topN, ndcg, validation_time))

                # Checkpoint on improvement: primary metric hits@topN,
                # ndcg@topN as tie-breaker.
                if hits > best_hit or (hits == best_hit and ndcg > best_ndcg):
                    # if (hits + ndcg) > (best_hit + best_ndcg):
                    count_patience_epochs = 0
                    with open(self.saved_model, "wb") as f:
                        torch.save(self._net.state_dict(), f)
                    # test_results_dict = result_test
                    best_hit, best_ndcg, best_epoch = hits, ndcg, epoch_num
                    # test_hit, test_ndcg = hits_test, ndcg_test
                else:
                    count_patience_epochs += 1
                if self._early_stopping_patience and count_patience_epochs > self._early_stopping_patience:
                    FileHandler.myprint(
                        "Early Stopped due to no better performance in %s epochs"
                        % count_patience_epochs)
                    break

            if np.isnan(epoch_loss) or epoch_loss == 0.0:
                raise ValueError(
                    'Degenerate epoch loss: {}'.format(epoch_loss))
        FileHandler.myprint("Closing tensorboard")
        TensorboardWrapper.mywriter().close()
        FileHandler.myprint(
            'Best result: | vad hits@%d = %.5f | vad ndcg@%d = %.5f | epoch = %d'
            % (topN, best_hit, topN, best_ndcg, best_epoch))
        FileHandler.myprint_details(
            json.dumps(test_results_dict, sort_keys=True, indent=2))
Пример #24
0
    def evaluate(self,
                 testRatings: interactions.MatchInteraction,
                 K: int,
                 output_ranking=False,
                 **kargs):
        """
        Rank every test query's candidate documents and compute ranking metrics.

        I decided to move this function into Fitter class since different models have different ways to evaluate (i.e.
        different data sources to use). Therefore, it is needed to have seperate evaluation methods in each Fitter
        class. Furthermore, I notice that this function uses _use_cuda which is a property of Fitter class.
        Parameters
        ----------
        testRatings
            Interactions object; ``unique_queries_test`` maps each query id to
            its candidates (docs, labels, doc_contents, _).
        K
            Rank cut-off for hits@K and ndcg@K (ndcg@1 is always reported too).
        output_ranking
            When True, additionally return per-query ranked lists for error
            analysis.
        kargs

        Returns
        -------
        dict with mean "ndcg", "hits", "ndcg@1" and the corresponding
        per-query lists; when ``output_ranking`` is True, a tuple of
        (results dict, sorted per-query ranking details).

        """
        ndcg_metric = normalized_discounted_cumulative_gain.NormalizedDiscountedCumulativeGain

        hits, ndcgs = [], []
        ndcgs_at_1 = []
        list_error_analysis = []
        for query, candidates in testRatings.unique_queries_test.items():
            docs, labels, doc_contents, _ = candidates
            query_content = testRatings.dict_query_contents[query]
            # Query length is replicated once per candidate document.
            query_len = [testRatings.dict_query_lengths[query]] * len(labels)
            doc_lens = [testRatings.dict_doc_lengths[d] for d in docs]

            # Per-token idf weights (0.0 for unknown terms), tiled per
            # candidate, only when a global idf table has been loaded.
            query_idfs = None
            if len(TFIDF.get_term_idf()) > 0:
                query_idf_dict = TFIDF.get_term_idf()
                query_idfs = [
                    query_idf_dict.get(int(word_idx), 0.0)
                    for word_idx in query_content
                ]
                query_idfs = np.tile(query_idfs, (len(labels), 1))
                query_idfs = my_utils.gpu(
                    torch.from_numpy(np.array(query_idfs)).float(),
                    self._use_cuda)

            query_content = np.tile(
                query_content,
                (len(labels), 1))  # len(labels), query_contnt_leng)
            doc_contents = np.array(doc_contents)
            # NOTE(review): these two gpu() calls omit self._use_cuda (unlike
            # every other call site) and receive numpy arrays — presumably
            # no-ops given my_utils.gpu's default; confirm and remove if so.
            query_content = my_utils.gpu(query_content)
            doc_contents = my_utils.gpu(doc_contents)

            query_content = my_utils.gpu(
                my_utils.numpy2tensor(query_content, dtype=torch.int),
                self._use_cuda)
            doc_contents = my_utils.gpu(
                my_utils.numpy2tensor(doc_contents, dtype=torch.int),
                self._use_cuda)

            # Score all candidates of this query in one forward pass.
            predictions = self._net.predict(query_content,
                                            doc_contents,
                                            query_lens=query_len,
                                            docs_lens=doc_lens,
                                            query_idf=query_idfs)
            ndcg_mz = ndcg_metric(K)(labels, predictions)
            ndcgs_at_1.append(ndcg_metric(1)(labels, predictions))
            ndcgs.append(ndcg_mz)
            positive_docs = set(
                [d for d, lab in zip(docs, labels) if lab == 1])
            indices = np.argsort(
                -predictions)[:K]  # indices of items with highest scores
            docs = np.array(docs)
            ranked_docs = docs[indices]
            if output_ranking:
                labels = np.array(labels)
                ranked_labels = labels[indices]
                scores = predictions[indices]
                assert scores.shape == ranked_labels.shape
                ranked_doc_list = [{
                    KeyWordSettings.Doc_cID:
                    int(d),
                    KeyWordSettings.Doc_cLabel:
                    int(lab),
                    KeyWordSettings.Doc_wImages: [],
                    KeyWordSettings.Doc_wContent:
                    testRatings.dict_doc_raw_contents[d],
                    KeyWordSettings.Relevant_Score:
                    float(score)
                } for d, lab, score in zip(ranked_docs, ranked_labels, scores)]

                q_details = {
                    KeyWordSettings.Query_id:
                    int(query),
                    KeyWordSettings.Query_Images: [],
                    KeyWordSettings.Ranked_Docs:
                    ranked_doc_list,
                    KeyWordSettings.Query_Content:
                    testRatings.dict_query_raw_contents[query]
                }
                list_error_analysis.append(q_details)

            hit = my_evaluator.getHitRatioForList(ranked_docs, positive_docs)
            # ndcg_mine = getNDCGForList(ranklist, positive_docs)
            hits.append(hit)
            # assert abs(ndcg_mine - ndcg_mz) < 1e-10, (ndcg_mine, ndcg_mz)

        # nanmean: queries whose metric is NaN are excluded from the average.
        results = {}
        results["ndcg"] = np.nanmean(ndcgs)
        results["ndcg_list"] = ndcgs
        results["hits"] = np.nanmean(hits)
        results["hits_list"] = hits
        results["ndcg@1"] = np.nanmean(ndcgs_at_1)
        results["ndcg@1_list"] = ndcgs_at_1

        if output_ranking:
            # NOTE(review): assumes KeyWordSettings.Query_id == "qid" —
            # verify, otherwise this sort raises KeyError.
            return results, sorted(list_error_analysis, key=lambda x: x["qid"])
        return results
Пример #25
0
    def fit(self, interactions,
            verbose=True,
            topN = 10,
            vadRatings = None,
            vadNegatives = None,
            testRatings = None,
            testNegatives = None,
            adjNetwork = None,
            user_user_sppmi = None,
            item_item_sppmi = None,
            user_user_sim = None,
            item_item_sim = None,
            alpha_gau: float = None, gamma_gau: float = None, beta_gau: float = None):
        """
        Fit the model.
        Parameters
        ----------

        interactions: :class:`interactions.Interactions`
            The input sequence dataset.
        vadRatings: :class:`list[list[int]]`
        vadNegatives: :class:`list[list[int]]`
        testRatings: :class:`list[list[int]]`
        testNegatives: :class:`list[list[int]]`
            Negative samples of every pair of (user, item) in  testRatings. shape (bs, 100)
            100 negative samples
        adjNetwork
            Adjacency information of the user network (consumed by
            ``_prepare_network_input``).
        user_user_sppmi, item_item_sppmi
            SPPMI matrices row-indexed by user / item id respectively.
        user_user_sim, item_item_sim
            Similarity matrices row-indexed by user / item id respectively.
        alpha_gau, gamma_gau, beta_gau
            Loss-weighting hyper-parameters forwarded to ``_get_loss``.
        """

        self._sampler.set_interactions(interactions)
        if not self._initialized():
            self._initialize(interactions)

        best_map, best_ndcg, best_epoch, test_ndcg, test_map = 0, 0, 0, 0, 0
        test_results_dict = None

        for epoch_num in range(self._n_iter):
            # Fresh negative samples and a reshuffle every epoch.
            user_ids, item_ids, neg_items_ids = self._sampler.get_train_instances(interactions, self._num_negative_samples, random_state = self._random_state)
            self._check_input(user_ids, item_ids)
            users, items, neg_items = my_utils.shuffle(user_ids, item_ids, neg_items_ids, random_state = self._random_state)

            user_ids_tensor = my_utils.gpu(torch.from_numpy(users), self._use_cuda)
            item_ids_tensor = my_utils.gpu(torch.from_numpy(items), self._use_cuda)
            neg_item_ids_tensor = my_utils.gpu(torch.from_numpy(neg_items), self._use_cuda)
            self._check_shape(user_ids_tensor, item_ids_tensor, neg_item_ids_tensor, self._num_negative_samples)
            epoch_loss = 0.0
            t1 = time.time()
            # Visited-sets ensure each user/item row of the auxiliary
            # matrices is selected at most once per epoch.
            visited_users = set()
            visited_users_sppmi = set()
            visited_item_sppmi = set()
            visited_user_sim = set()
            visited_item_sim = set()

            for (minibatch_num,
                 (batch_user, batch_item, batch_negatives)) in enumerate(my_utils.minibatch(user_ids_tensor, item_ids_tensor, neg_item_ids_tensor,
                                                                    batch_size = self._batch_size)):

                # need to duplicate batch_user and batch_item
                network = self._prepare_network_input(batch_user, visited_users, adjNetwork)
                user_user_sppmi_selected = self._select_user_user_sppmi_input(batch_user, visited_users_sppmi, user_user_sppmi)
                item_item_sppmi_selected = self._select_user_user_sppmi_input(batch_item, visited_item_sppmi, item_item_sppmi)
                # BUGFIX: was batch_item — the user-user similarity matrix is
                # row-indexed by user ids (and tracked by visited_user_sim),
                # so it must be selected with the user batch.
                user_user_sim_selected = self._select_user_user_sppmi_input(batch_user, visited_user_sim, user_user_sim)
                item_item_sim_selected = self._select_user_user_sppmi_input(batch_item, visited_item_sim, item_item_sim)
                self._optimizer.zero_grad()
                loss = self._get_loss(batch_user, batch_item, network, user_user_sppmi_selected,
                                      item_item_sppmi_selected, user_user_sim_selected, item_item_sim_selected, alpha_gau, gamma_gau, beta_gau)

                epoch_loss += loss.item()
                loss.backward()
                self._optimizer.step()

            epoch_loss /= minibatch_num + 1
            t2 = time.time()
            epoch_train_time = t2 - t1
            if verbose:  # validation after each epoch
                t1 = time.time()
                result_val = self.evaluate(vadRatings, vadNegatives, topN)
                mapks = result_val["map"]
                ndcg = result_val["ndcg"]
                recall = result_val["recall"]

                result_test = self.evaluate(testRatings, testNegatives, topN)
                maps_test = result_test["map"]
                ndcg_test = result_test["ndcg"]
                recall_test = result_test["recall"]

                t2 = time.time()
                eval_time = t2 - t1
                self.output_handler.myprint('|Epoch %d | Train time: %d (s) | Train loss: %.5f | Eval time: %.3f (s) '
                      '| Vad mapks@%d = %.5f | Vad ndcg@%d = %.5f | Vad recall@%d = %.5f '
                      '| Test mapks@%d = %.5f | Test ndcg@%d = %.5f | Test recall@%d = %.5f'
                      % (epoch_num, epoch_train_time, epoch_loss, eval_time, topN, mapks, topN, ndcg,
                         topN, recall, topN, maps_test, topN, ndcg_test, topN, recall_test))
                # Checkpoint whenever validation ndcg improves.
                if ndcg > best_ndcg:
                    with open(self.saved_model, "wb") as f:
                        torch.save(self._net, f)
                    test_results_dict = result_test
                    best_map, best_ndcg, best_epoch = mapks, ndcg, epoch_num
                    test_map, test_ndcg = maps_test, ndcg_test

            if np.isnan(epoch_loss) or epoch_loss == 0.0:
                raise ValueError('Degenerate epoch loss: {}'.format(epoch_loss))

        self.output_handler.myprint('Best result: '
              '| vad precisions@%d = %.3f | vad ndcg@%d = %.3f '
              '| test precisions@%d = %.3f | test ndcg@%d = %.3f | epoch = %d' % (topN, best_map, topN, best_ndcg,
                                                                            topN, test_map, topN, test_ndcg, best_epoch))
        self.output_handler.myprint_details(json.dumps(test_results_dict, sort_keys = True, indent = 2))
Пример #26
0
def fit_models(args):
    """
    End-to-end training driver: set up logging and seeds, load (optionally
    visual and ELMo-augmented) data, build the multimodal attention network,
    fit it, and evaluate the best checkpoint.

    Parameters
    ----------
    args
        Parsed command-line namespace; must provide paths (``log``, ``path``,
        ``query_mapped``, ``article_mapped``, ``elmo_feats``, image feature
        files), model hyper-parameters, and flags such as ``cuda`` and
        ``use_visual``.
    """
    # makedirs(exist_ok=True) replaces the racy exists()+mkdir pair and also
    # creates missing parents.
    os.makedirs(args.log, exist_ok=True)

    curr_date = datetime.datetime.now().timestamp()  # seconds
    # folder to store all outputed files of a run
    secondary_log_folder = os.path.join(args.log,
                                        "log_results_%s" % (int(curr_date)))
    os.makedirs(secondary_log_folder, exist_ok=True)

    logfolder_result = os.path.join(secondary_log_folder,
                                    "%s_result.txt" % int(curr_date))
    FileHandler.init_log_files(logfolder_result)
    settings = json.dumps(vars(args), sort_keys=True, indent=2)
    FileHandler.myprint("Running script " + str(os.path.realpath(__file__)))
    FileHandler.myprint(settings)
    FileHandler.myprint("Setting seed to " + str(args.seed))

    # Seed every RNG and disable cudnn autotuning for reproducibility.
    seed = args.seed
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.enabled = False

    if args.cuda:
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    # Invert the name->index mappings to index->name; `with` closes the
    # handles (the old json.loads(open(...).read()) leaked them).
    with open(args.query_mapped) as f:
        index2queries = dict((y, x) for x, y in json.load(f).items())
    with open(args.article_mapped) as f:
        index2docs = dict((y, x) for x, y in json.load(f).items())
    root = args.path
    use_reranking = "reranking" in root
    t1 = time.time()

    # Pre-extracted contextual (ELMo) features for queries and articles.
    elmo_queries_path = os.path.join(args.elmo_feats, "queries_feats.pth")
    elmo_docs_path = os.path.join(args.elmo_feats, "articles_feats.pth")
    elmo_loader = load_data.ElmoLoader(elmo_queries_path, elmo_docs_path,
                                       args.fixed_length_left,
                                       args.fixed_length_right)
    load_data_func = elmo_loader.elmo_load_data

    train_pack = load_data_func(root, 'train', prefix=args.dataset)
    valid_pack = load_data_func(root, 'dev', prefix=args.dataset)
    predict_pack = load_data_func(root, 'test', prefix=args.dataset)
    if use_reranking:
        FileHandler.myprint("Using Re-Ranking Dataset..........")
        predict2_hard_pack = load_data_func(root,
                                            'test2_hard',
                                            prefix=args.dataset)

    # Token-length statistics across splits, logged for sanity checking.
    a = train_pack.left["text_left"].str.lower().str.split().apply(len).max()
    b = valid_pack.left["text_left"].str.lower().str.split().apply(len).max()
    c = predict_pack.left["text_left"].str.lower().str.split().apply(len).max()
    max_query_length = max([a, b, c])
    min_query_length = min([a, b, c])

    a = train_pack.right["text_right"].str.lower().str.split().apply(len).max()
    b = valid_pack.right["text_right"].str.lower().str.split().apply(len).max()
    c = predict_pack.right["text_right"].str.lower().str.split().apply(
        len).max()
    max_doc_length = max([a, b, c])
    min_doc_length = min([a, b, c])

    FileHandler.myprint("Min query length, " + str(min_query_length) +
                        " Min doc length " + str(min_doc_length))
    FileHandler.myprint("Max query length, " + str(max_query_length) +
                        " Max doc length " + str(max_doc_length))

    if args.use_visual:
        image_loader = load_data.ImagesLoader(
            left_pth_file=args.left_images_features,
            max_num_left_images=args.n_img_in_query,
            right_pth_file=args.right_images_features,
            max_num_right_images=args.n_img_in_doc,
            use_cuda=args.cuda)
        data_packs = [train_pack, valid_pack, predict_pack]
        if use_reranking:
            data_packs.append(predict2_hard_pack)

        image_loader.fit(data_packs)  # memory-intensive (~10Gb RAM)
        train_pack = image_loader.transform(train_pack)
        valid_pack = image_loader.transform(valid_pack)
        predict_pack = image_loader.transform(predict_pack)
        if use_reranking:
            predict2_hard_pack = image_loader.transform(predict2_hard_pack)

        print(image_loader.left_tensor.size(),
              image_loader.right_tensor.size())

    preprocessor = mz.preprocessors.ElmoPreprocessor(args.fixed_length_left,
                                                     args.fixed_length_right)
    print('parsing data')
    train_processed = preprocessor.fit_transform(
        train_pack)  # This is a DataPack
    valid_processed = preprocessor.transform(valid_pack)
    predict_processed = preprocessor.transform(predict_pack)

    train_interactions = MatchInteractionVisual(train_processed)
    valid_interactions = MatchInteractionVisual(valid_processed)
    test_interactions = MatchInteractionVisual(predict_processed)
    if use_reranking:
        predict2_processed = preprocessor.transform(predict2_hard_pack)
        predict2_interactions = MatchInteractionVisual(predict2_processed)

    FileHandler.myprint('done extracting')
    t2 = time.time()
    FileHandler.myprint('loading data time: %d (seconds)' % (t2 - t1))
    FileHandler.myprint("Building model")

    print("Loading word embeddings......")
    t1_emb = time.time()
    term_index = preprocessor.context['vocab_unit'].state['term_index']
    glove_embedding = mz.datasets.embeddings.load_glove_embedding(
        dimension=args.word_embedding_size, term_index=term_index)

    # L2-normalize each embedding row.
    embedding_matrix = glove_embedding.build_matrix(term_index)
    l2_norm = np.sqrt((embedding_matrix * embedding_matrix).sum(axis=1))
    embedding_matrix = embedding_matrix / l2_norm[:, np.newaxis]
    t2_emb = time.time()
    print("Time to load word embeddings......", (t2_emb - t1_emb))

    match_params = {}
    match_params['embedding'] = embedding_matrix
    match_params["embedding_freeze"] = True  # freezing word embeddings
    match_params["fixed_length_left"] = args.fixed_length_left
    match_params["fixed_length_right"] = args.fixed_length_right
    match_params['dropout'] = 0.1
    match_params['filters'] = args.filters
    match_params["conv_layers"] = args.conv_layers
    match_params["filters_count_pacrr"] = args.filters_count_pacrr
    match_params["n_s"] = args.n_s
    match_params["max_ngram"] = args.max_ngram
    match_params["head_cnn_type"] = args.head_cnn_type
    match_params["use_visual"] = args.use_visual
    match_params[
        "use_average_dcompositional_att"] = args.use_average_dcompositional_att
    match_params["attention_type"] = args.attention_type
    # contextualized part
    match_params["left_elmo_tensor"] = elmo_loader.left_tensor_feats
    match_params["right_elmo_tensor"] = elmo_loader.right_tensor_feats
    match_params["elmo_vec_size"] = 1024

    if args.use_visual:
        match_params["visual_feature_size"] = image_loader.visual_features_size
        image_loader.left_tensor = torch_utils.gpu(image_loader.left_tensor,
                                                   args.cuda)
        image_loader.right_tensor = torch_utils.gpu(image_loader.right_tensor,
                                                    args.cuda)
        match_params["full_left_images_tensor"] = image_loader.left_tensor
        match_params["full_right_images_tensor"] = image_loader.right_tensor

    match_model = multimodal_attention_network.MultiModalAttentionNetwork(
        match_params)
    FileHandler.myprint("Fitting Model")
    if args.use_visual:
        FileHandler.myprint("Using both Textual and Visual features.......")
        fit_model = fitter.VisualFitter(net=match_model,
                                        loss=args.loss_type,
                                        n_iter=args.epochs,
                                        batch_size=args.batch_size,
                                        learning_rate=args.lr,
                                        early_stopping=args.early_stopping,
                                        use_cuda=args.cuda,
                                        num_negative_samples=args.num_neg,
                                        logfolder=secondary_log_folder,
                                        curr_date=curr_date,
                                        use_visual=args.use_visual,
                                        image_loader=image_loader,
                                        index2queries=index2queries,
                                        index2docs=index2docs)
    else:
        FileHandler.myprint("Using Textual content only....")
        fit_model = contextualized_fitter.ContextualizedFitter(
            net=match_model,
            loss=args.loss_type,
            n_iter=args.epochs,
            batch_size=args.batch_size,
            learning_rate=args.lr,
            early_stopping=args.early_stopping,
            use_cuda=args.cuda,
            num_negative_samples=args.num_neg,
            logfolder=secondary_log_folder,
            curr_date=curr_date)

    try:
        fit_model.fit(train_interactions,
                      verbose=True,
                      topN=args.topk,
                      val_interactions=valid_interactions,
                      test_interactions=test_interactions)
        fit_model.load_best_model(valid_interactions,
                                  test_interactions,
                                  topN=args.topk)
        if use_reranking:
            fit_model.load_best_model_test2_test3(predict2_interactions,
                                                  None,
                                                  topN=args.topk)

    except KeyboardInterrupt:
        FileHandler.myprint('Exiting from training early')
    t10 = time.time()
    FileHandler.myprint('Total time:  %d (seconds)' % (t10 - t1))
    def fit(
            self,
            train_iteractions: interactions.MatchInteraction,
            verbose=True,  # for printing out evaluation during training
            val_interactions: interactions.MatchInteraction = None,
            test_interactions: interactions.MatchInteraction = None):
        """
        Fit the model.

        Trains for up to ``self._n_iter`` epochs. Each epoch re-samples and
        shuffles training instances, optimizes the network minibatch by
        minibatch (with gradient-norm clipping), and, when ``verbose`` is on,
        evaluates on ``val_interactions``; the weights with the best
        validation cross-entropy are checkpointed to ``self.saved_model``.
        Training stops early after more than ``self._early_stopping_patience``
        consecutive epochs without validation improvement.

        Parameters
        ----------
        train_iteractions: :class:`matchzoo.DataPack` The input sequence dataset.
        val_interactions: :class:`matchzoo.DataPack`
        test_interactions: :class:`matchzoo.DataPack`
            NOTE(review): not used inside this method (kept for interface
            parity with other fitters) — confirm against callers.
        """
        self._initialize()
        best_ce, best_epoch, test_ce = sys.maxsize, 0, 0
        test_results_dict = None
        iteration_counter = 0
        count_patience_epochs = 0  # consecutive epochs without val improvement

        for epoch_num in range(self._n_iter):
            # Sampling/shuffling happens inside the epoch loop so that every
            # epoch sees a freshly drawn, freshly shuffled set of instances.
            self._net.train(True)
            query_ids, left_contents, left_lengths, \
            doc_ids, right_contents, target_contents, right_lengths = self._sampler.get_instances(train_iteractions)

            queries, query_content, query_lengths, \
            docs, doc_content, target_contents, doc_lengths = my_utils.shuffle(query_ids, left_contents, left_lengths,
                                                              doc_ids, right_contents, target_contents, right_lengths)
            epoch_loss, total_pairs = 0.0, 0
            t1 = time.time()
            for (minibatch_num, (batch_query, batch_query_content, batch_query_len,
                 batch_doc, batch_doc_content, batch_doc_target, batch_docs_lens)) \
                    in enumerate(my_utils.minibatch(queries, query_content, query_lengths,
                                                    docs, doc_content, target_contents, doc_lengths,
                                                    batch_size = self._batch_size)):
                t3 = time.time()
                # Move the minibatch tensors onto the GPU when cuda is enabled.
                batch_query = my_utils.gpu(torch.from_numpy(batch_query),
                                           self._use_cuda)
                batch_query_content = my_utils.gpu(
                    torch.from_numpy(batch_query_content), self._use_cuda)
                # batch_query_len = my_utils.gpu(torch.from_numpy(batch_query_len), self._use_cuda)
                batch_doc = my_utils.gpu(torch.from_numpy(batch_doc),
                                         self._use_cuda)
                batch_doc_content = my_utils.gpu(
                    torch.from_numpy(batch_doc_content), self._use_cuda)
                batch_doc_target = my_utils.gpu(
                    torch.from_numpy(batch_doc_target), self._use_cuda)
                # batch_docs_lens = my_utils.gpu(torch.from_numpy(batch_docs_lens), self._use_cuda)

                total_pairs += batch_query.size(0)  # (batch_size)
                self._optimizer.zero_grad()
                loss = self._get_loss(batch_query, batch_query_content,
                                      batch_doc, batch_doc_content,
                                      batch_query_len, batch_docs_lens,
                                      batch_doc_target)
                epoch_loss += loss.item()
                iteration_counter += 1
                # if iteration_counter % 2 == 0: break
                TensorboardWrapper.mywriter().add_scalar(
                    "loss/minibatch_loss", loss.item(), iteration_counter)
                loss.backward()
                # Clip gradient norm to self._clip to stabilize training.
                torch.nn.utils.clip_grad_norm_(self._net.parameters(),
                                               self._clip)
                self._optimizer.step()
                t4 = time.time()
                # if iteration_counter % 100 == 0: print("Running time for each mini-batch: ", (t4 - t3), "s")
            # Average over training instances, not minibatches: each batch
            # contributed batch_query.size(0) to total_pairs above.
            epoch_loss /= float(total_pairs)
            TensorboardWrapper.mywriter().add_scalar("loss/epoch_loss_avg",
                                                     epoch_loss, epoch_num)
            # print("Number of Minibatches: ", minibatch_num, "Avg. loss of epoch: ", epoch_loss)
            t2 = time.time()
            epoch_train_time = t2 - t1
            if verbose:  # validation after each epoch
                t1 = time.time()
                result_val = self.evaluate(val_interactions)
                val_ce = result_val["cross_entropy"]
                t2 = time.time()
                validation_time = t2 - t1

                TensorboardWrapper.mywriter().add_scalar(
                    "cross_entropy/val_ce", val_ce, epoch_num)
                FileHandler.myprint(
                    '|Epoch %03d | Train time: %04.1f(s) | Train loss: %.3f'
                    '| Val loss = %.5f | Validation time: %04.1f(s)' %
                    (epoch_num, epoch_train_time, epoch_loss, val_ce,
                     validation_time))

                # Checkpoint whenever validation cross-entropy improves;
                # otherwise accumulate patience toward early stopping.
                if val_ce < best_ce:
                    count_patience_epochs = 0
                    with open(self.saved_model, "wb") as f:
                        torch.save(self._net.state_dict(), f)
                    # test_results_dict = result_test
                    best_ce, best_epoch = val_ce, epoch_num
                else:
                    count_patience_epochs += 1
                if self._early_stopping_patience and count_patience_epochs > self._early_stopping_patience:
                    FileHandler.myprint(
                        "Early Stopped due to no better performance in %s epochs"
                        % count_patience_epochs)
                    break

            # Abort on NaN or exactly-zero epoch loss — both indicate a
            # degenerate training run.
            if np.isnan(epoch_loss) or epoch_loss == 0.0:
                raise ValueError(
                    'Degenerate epoch loss: {}'.format(epoch_loss))
        FileHandler.myprint("Closing tensorboard")
        TensorboardWrapper.mywriter().close()
        FileHandler.myprint(
            'Best result: | vad cross_entropy = %.5f | epoch = %d' %
            (best_ce, best_epoch))
        # NOTE(review): test_results_dict is never assigned (the assignment is
        # commented out above), so this always prints "null" — confirm intent.
        FileHandler.myprint_details(
            json.dumps(test_results_dict, sort_keys=True, indent=2))
 def _initialize(self):
     """Prepare for training: place the network on the GPU (when cuda is
     enabled) and build an Adam optimizer over its parameters."""
     net = my_utils.gpu(self._net, self._use_cuda)
     self._net = net
     self._optimizer = optim.Adam(net.parameters(), lr=self._learning_rate)
Пример #29
0
    def evaluate(self, testRatings: interactions.MatchInteractionVisual, K: int, output_ranking = False, **kargs):
        """
        Evaluate the ranker on a held-out query set with Hit@K and NDCG.

        I decided to move this function into Fitter class since different models have different ways to evaluate (i.e.
        different data sources to use). Therefore, it is needed to have seperate evaluation methods in each Fitter
        class. Furthermore, I notice that this function uses _use_cuda which is a property of Fitter class.

        Parameters
        ----------
        testRatings: :class:`interactions.MatchInteractionVisual`
            Per query: candidate docs, labels, contents and image indices.
        K: int
            Cut-off for Hit@K and NDCG@K (NDCG@1 is always also computed).
        output_ranking: bool
            If True, also return per-query ranked lists for error analysis.
        kargs
            Extra keyword arguments; unused here.

        Returns
        -------
        dict with "ndcg", "ndcg@1", "hits" averages and their per-query
        lists; when ``output_ranking`` is True, additionally the sorted
        error-analysis records.
        """
        ndcg_metric = normalized_discounted_cumulative_gain.NormalizedDiscountedCumulativeGain

        hits, ndcgs = [], []
        ndcgs_at_1 = []
        list_error_analysis = []
        for query, candidates in tqdm(testRatings.unique_queries_test.items()):
            t3 = time.time()
            docs, labels, doc_contents, _ = candidates
            query_content = testRatings.dict_query_contents[query]
            query_images_indices = testRatings.dict_query_imgages[query]
            query_len = [testRatings.dict_query_lengths[query]] * len(labels)
            doc_lens = [testRatings.dict_doc_lengths[d] for d in docs]
            doc_images_indices = [testRatings.dict_doc_imgages[d] for d in docs]

            additional_data = {}
            additional_data[KeyWordSettings.Query_lens] = query_len
            additional_data[KeyWordSettings.Doc_lens] = doc_lens
            # Attach per-term query IDF weights when a TF-IDF table exists.
            if len(TFIDF.get_term_idf()) > 0:
                query_idf_dict = TFIDF.get_term_idf()
                query_idfs = [query_idf_dict.get(int(word_idx), 0.0) for word_idx in query_content]
                query_idfs = np.tile(query_idfs, (len(labels), 1))
                query_idfs = my_utils.gpu(torch.from_numpy(np.array(query_idfs)).float(), self._use_cuda)
                additional_data[KeyWordSettings.Query_Idf] = query_idfs
            if self.use_visual:
                t1 = time.time()
                query_images_indices = np.array(query_images_indices)
                assert query_images_indices.shape == (len(query_images_indices), )
                query_images = query_images_indices.reshape(1, 1, len(query_images_indices))
                doc_images = np.array(doc_images_indices)
                query_images = torch_utils.gpu(torch.from_numpy(query_images), self._use_cuda)
                doc_images = torch_utils.gpu(torch.from_numpy(doc_images), self._use_cuda)
                additional_data[KeyWordSettings.QueryImagesIndices] = query_images  # (1, 1, M1)
                additional_data[KeyWordSettings.DocImagesIndices] = doc_images.unsqueeze(1)  # (B, 1, M2)
                t2 = time.time()
                # print("Loading time images to gpu of validation: ", t2 - t1, "seconds")
            if output_ranking: additional_data[KeyWordSettings.OutputRankingKey] = True  # for error analysis
            # Tile the query so each of the len(labels) candidate docs is
            # scored against the same query content.
            query_content = np.tile(query_content, (len(labels), 1))  # len(labels), query_contnt_leng)
            doc_contents = np.array(doc_contents)

            # (Removed two dead `my_utils.gpu(...)` calls that were applied to
            # raw numpy arrays without the use_cuda flag; their results were
            # immediately overwritten by the two conversions below.)
            query_content = my_utils.gpu(my_utils.numpy2tensor(query_content, dtype=torch.int), self._use_cuda)
            doc_contents = my_utils.gpu(my_utils.numpy2tensor(doc_contents, dtype=torch.int), self._use_cuda)
            additional_data[KeyWordSettings.QueryIDs] = np.array([query] * len(labels))
            additional_data[KeyWordSettings.DocIDs] = np.array(docs)
            additional_data[KeyWordSettings.UseCuda] = self._use_cuda

            predictions = self._net.predict(query_content, doc_contents, **additional_data)
            if output_ranking:
                # NOTE(review): doc_images / query_images_indices-as-array are
                # only defined when self.use_visual is True; output_ranking
                # without use_visual would raise NameError here — confirm
                # callers always combine the two flags.
                assert len(predictions.shape) == 1
                _, M2 = doc_images.shape
                predictions = predictions.reshape(len(doc_lens), 1 + (len(query_images_indices) * M2))
                predictions, visual_sims = predictions[:, 0], predictions[:, 1:]
                visual_sims = visual_sims.reshape(len(doc_lens), len(query_images_indices), M2)
                visual_sims = visual_sims.transpose(0, 2, 1)  # (B, M2, M1)

            t4 = time.time()
            # print("Computing time of each query: ", (t4 - t3), "seconds")
            ndcg_mz = ndcg_metric(K)(labels, predictions)
            ndcgs_at_1.append(ndcg_metric(1)(labels, predictions))
            ndcgs.append(ndcg_mz)
            positive_docs = {d for d, lab in zip(docs, labels) if lab == 1}
            indices = np.argsort(-predictions)[:K]  # indices of items with highest scores
            docs = np.array(docs)
            ranked_docs = docs[indices]
            if output_ranking:
                labels = np.array(labels)
                ranked_labels = labels[indices]
                scores = predictions[indices]
                visual_sims = visual_sims[indices] if False else visual_sims  # no-op guard removed below
                visual_scores = visual_sims[indices]  # (B, M2 * M1)) due to transpose
                assert scores.shape == ranked_labels.shape
                ranked_doc_list = [{KeyWordSettings.Doc_cID: int(d),
                                    KeyWordSettings.Doc_URL: self.index2docs[int(d)],
                                    KeyWordSettings.Doc_cLabel: int(lab),
                                    KeyWordSettings.Doc_wImages: ["%s %s" % (x, str(y)) for x, y in
                                                                  zip(list(map(self.image_loader.right_img_index2path.get,
                                                                               testRatings.dict_doc_imgages[d])), visual_score.tolist())],
                                    KeyWordSettings.Doc_wContent: testRatings.dict_doc_raw_contents[d],
                                    KeyWordSettings.Relevant_Score: float(score)}
                                   for d, lab, score, visual_score in zip(ranked_docs, ranked_labels, scores, visual_scores)]

                q_details = {KeyWordSettings.Query_id: int(query),
                             KeyWordSettings.Query_TweetID: "http://twitter.com/user/status/" + self.index2queries[int(query)],
                             KeyWordSettings.Query_Images: list(map(self.image_loader.left_img_index2path.get, query_images_indices)),
                             KeyWordSettings.Ranked_Docs: ranked_doc_list,
                             KeyWordSettings.Query_Content: testRatings.dict_query_raw_contents[query]}
                list_error_analysis.append(q_details)

            hit = my_evaluator.getHitRatioForList(ranked_docs, positive_docs)
            # ndcg_mine = getNDCGForList(ranklist, positive_docs)
            hits.append(hit)
            # assert abs(ndcg_mine - ndcg_mz) < 1e-10, (ndcg_mine, ndcg_mz)

        results = {}
        results["ndcg"] = np.nanmean(ndcgs)
        results["ndcg_list"] = ndcgs
        results["hits"] = np.nanmean(hits)
        results["hits_list"] = hits
        results["ndcg@1"] = np.nanmean(ndcgs_at_1)
        results["ndcg@1_list"] = ndcgs_at_1

        # NOTE(review): the sort key "qid" assumes KeyWordSettings.Query_id
        # equals the literal string "qid" — verify against KeyWordSettings.
        if output_ranking: return results, sorted(list_error_analysis, key=lambda x: x["qid"])
        return results
Пример #30
0
def evaluate(model: BaseModel,
             testRatings: interactions.MatchInteraction,
             K: int,
             _use_cuda,
             output_ranking=False):
    """
    Evaluate a ranking model with Hit@K and NDCG@K over a test set.

    We could extend it to add more metrics in the future.

    Parameters
    ----------
    model: a fitter (not wise)
        Object exposing ``predict(query, docs, ...)``.
    testRatings: :class:`interactions.MatchInteraction`
        Per query: its candidate docs, labels, contents and lengths.
    K: int
        top k ranked documents
    _use_cuda: bool
        Whether tensors should be moved to the GPU.
    output_ranking: bool
        output the ranked docs with respect to a query for error analysis

    Returns
    -------
    dict with "ndcg"/"hits" averages and their per-query lists; when
    ``output_ranking`` is True, also the per-query ranking details sorted
    by query id.
    """
    ndcg_metric = normalized_discounted_cumulative_gain.NormalizedDiscountedCumulativeGain

    hits, ndcgs = [], []
    list_error_analysis = []
    for query, candidates in testRatings.unique_queries_test.items():
        docs, labels, doc_contents, _ = candidates
        query_content = testRatings.dict_query_contents[query]
        query_len = [testRatings.dict_query_lengths[query]] * len(labels)
        doc_lens = [testRatings.dict_doc_lengths[d] for d in docs]

        # Optional per-term query IDF weights (only when a table is loaded).
        query_idfs = None
        if len(TFIDF.get_term_idf()) > 0:
            query_idf_dict = TFIDF.get_term_idf()
            query_idfs = [
                query_idf_dict.get(int(word_idx), 0.0)
                for word_idx in query_content
            ]
            query_idfs = np.tile(query_idfs, (len(labels), 1))
            query_idfs = my_utils.gpu(
                torch.from_numpy(np.array(query_idfs)).float(), _use_cuda)

        # Tile the query so each candidate doc is scored against it.
        query_content = np.tile(
            query_content, (len(labels), 1))  # len(labels), query_contnt_leng)
        doc_contents = np.array(doc_contents)

        # (Removed two dead `my_utils.gpu(...)` calls on raw numpy arrays
        # without the use_cuda flag; their results were immediately
        # overwritten by the two conversions below.)
        query_content = my_utils.gpu(
            my_utils.numpy2tensor(query_content, dtype=torch.int), _use_cuda)
        doc_contents = my_utils.gpu(
            my_utils.numpy2tensor(doc_contents, dtype=torch.int), _use_cuda)

        predictions = model.predict(query_content,
                                    doc_contents,
                                    query_lens=query_len,
                                    docs_lens=doc_lens,
                                    query_idf=query_idfs)
        ndcg_mz = ndcg_metric(K)(labels, predictions)
        ndcgs.append(ndcg_mz)
        positive_docs = {d for d, lab in zip(docs, labels) if lab == 1}
        indices = np.argsort(
            -predictions)[:K]  # indices of items with highest scores
        docs = np.array(docs)
        ranked_docs = docs[indices]
        if output_ranking:
            labels = np.array(labels)
            ranked_labels = labels[indices]
            scores = predictions[indices]
            assert scores.shape == ranked_labels.shape
            ranked_doc_list = [{
                KeyWordSettings.Doc_cID:
                int(d),
                KeyWordSettings.Doc_cLabel:
                int(lab),
                KeyWordSettings.Doc_wImages: [],
                KeyWordSettings.Doc_wContent:
                testRatings.dict_doc_raw_contents[d],
                KeyWordSettings.Relevant_Score:
                float(score)
            } for d, lab, score in zip(ranked_docs, ranked_labels, scores)]

            q_details = {
                KeyWordSettings.Query_id:
                int(query),
                KeyWordSettings.Query_Images: [],
                KeyWordSettings.Ranked_Docs:
                ranked_doc_list,
                KeyWordSettings.Query_Content:
                testRatings.dict_query_raw_contents[query]
            }
            list_error_analysis.append(q_details)

        hit = getHitRatioForList(ranked_docs, positive_docs)
        hits.append(hit)
        # assert abs(ndcg_mine - ndcg_mz) < 1e-10, (ndcg_mine, ndcg_mz)

    results = {}
    results["ndcg"] = np.nanmean(ndcgs)
    results["ndcg_list"] = ndcgs
    results["hits"] = np.nanmean(hits)
    results["hits_list"] = hits

    # NOTE(review): the sort key "qid" assumes KeyWordSettings.Query_id is
    # the literal string "qid" — verify against KeyWordSettings.
    if output_ranking:
        return results, sorted(list_error_analysis, key=lambda x: x["qid"])
    return results