Example #1
    def train(self, emails, val_data, w2v, epochs=10, save_model=True):
        optimizer = optim.RMSprop(self.parameters(),
                                  lr=1e-3,
                                  alpha=0.99,
                                  momentum=0.0)

        for epoch in range(epochs):
            epoch_loss = 0.0
            start = time.time()
            # loop over each mail
            for i in range(len(emails)):
                optimizer.zero_grad()
                loss, valid = self.predict(emails[i, :], w2v)
                if valid:
                    # propagate the loss backward and compute the gradient
                    loss.backward()
                    # change weights based on gradient value
                    optimizer.step()
                    epoch_loss += loss.data.numpy()
            end = time.time()
            print('time taken for epoch:', (end - start))
            print('loss in epoch ' + str(epoch) + ' = ' + str(epoch_loss))

        if save_model:
            file_name = constants.RUN_ID + '_model.pth'
            self.save(file_name)
        email_ids, embs = self.extract_user_embeddings()
        utils.save_user_embeddings(email_ids, embs)
        plots.plot_with_tsne(email_ids, embs, display_hover=False)
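
The loop in Example #1 relies on a predict() helper that returns a (loss, valid) pair so that rows without usable embeddings can be skipped. Below is one minimal sketch of such a method, modelled on Example #2's regression objective; the w2v averaging call, the forward() signature, and the module-level imports (torch, torch.nn as nn, utils, constants) are assumptions rather than part of the excerpt.

    # Hypothetical sketch (not in the source): a predict() matching how the loop
    # in Example #1 calls it. It mirrors Example #2's objective, regressing the
    # averaged email embedding from the sender/receiver ids, and flags rows
    # that cannot be scored.
    def predict(self, email, w2v):
        sender_id = utils.get_userid(email[constants.SENDER_EMAIL])
        recv_id = utils.get_userid(email[constants.RECEIVER_EMAILS].split('|')[0])
        email_rep = w2v.get_email_reps(email.reshape(1, -1), average=True)[0]
        if sender_id is None or recv_id is None or email_rep is None:
            return None, False  # unknown user or no usable word vectors
        pred = self.forward(torch.LongTensor([sender_id]),
                            torch.LongTensor([recv_id]))
        loss = nn.MSELoss()(pred,
                            torch.from_numpy(email_rep).float().unsqueeze(0))
        return loss, True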
Example #2
    def train(self, emails, w2v, epochs=10, save_model=True):
        loss_criteria = nn.MSELoss()
        optimizer = optim.RMSprop(self.parameters(),
                                  lr=0.0001,
                                  alpha=0.99,
                                  momentum=0.0)
        # optimizer = optim.Adam(self.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)
        email_reps = w2v.get_email_reps(emails, average=True)

        for epoch in range(epochs):
            print('running epoch', epoch)
            start = time.time()
            epoch_loss = 0.0
            for i in range(len(emails)):
                sender_id = utils.get_userid(emails[i, constants.SENDER_EMAIL])
                # if no word_rep was found for any of the words in the email, ignore this case
                if email_reps[i] is None:
                    continue
                # gets the average email embedding based on word embeddings of all the words in the mail
                email_rep = email_reps[i]
                recv_list = emails[i, constants.RECEIVER_EMAILS].split('|')
                for recv in recv_list:
                    optimizer.zero_grad()
                    recv_id = utils.get_userid(recv)
                    # if sender or receiver is not an Enron email id, we ignore this data point
                    if sender_id is None or recv_id is None:
                        continue
                    # a valid sender/receiver pair has been found, so update their frequencies
                    sender_email = emails[i, constants.SENDER_EMAIL]
                    self.emailid_train_freq[sender_email] = \
                        self.emailid_train_freq.get(sender_email, 0) + 1
                    self.emailid_train_freq[recv] = \
                        self.emailid_train_freq.get(recv, 0) + 1
                    # do the forward pass
                    pred_email_rep = self.forward(
                        autograd.Variable(torch.LongTensor([sender_id])),
                        autograd.Variable(torch.LongTensor([recv_id])))
                    # compute the loss
                    loss = loss_criteria(
                        pred_email_rep,
                        autograd.Variable(torch.from_numpy(email_rep)))
                    # propagate the loss backward and compute the gradient
                    loss.backward()
                    # change weights based on gradient value
                    optimizer.step()
                    epoch_loss += loss.data.numpy()
            end = time.time()
            print('time taken:', (end - start))
            print('loss in epoch ' + str(epoch) + ' = ' + str(epoch_loss))

        if save_model:
            file_name = constants.RUN_ID + '_model.pth'
            self.save(file_name)
        email_ids, embs = self.extract_user_embeddings()
        utils.save_user_embeddings(email_ids, embs)
        # utils.get_similar_users(email_ids, embs)
        plots.plot_with_tsne(email_ids, embs, display_hover=False)
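
Example #2 calls self.forward(sender_id, recv_id) and compares the result against the averaged word2vec embedding of the email with nn.MSELoss, so the model has to map a pair of user ids into the word2vec space. The class below is one minimal module consistent with that calling convention; the class name, layer names, and sizes are assumptions.

# Hypothetical sketch (not in the source): a module whose forward() matches the
# calls in Example #2. It embeds sender and receiver ids and projects the
# concatenated pair into the word2vec space.
import torch
import torch.nn as nn

class UserPairToEmailSketch(nn.Module):
    def __init__(self, num_users, user_dim, email_dim):
        super(UserPairToEmailSketch, self).__init__()
        self.user_embeds = nn.Embedding(num_users, user_dim)
        self.project = nn.Linear(2 * user_dim, email_dim)

    def forward(self, sender_id, recv_id):
        # sender_id, recv_id: LongTensors of shape [1], as in the training loop
        pair = torch.cat([self.user_embeds(sender_id),
                          self.user_embeds(recv_id)], dim=1)
        return self.project(pair)  # shape [1, email_dim], compared via MSELoss

With a shape like this, extract_user_embeddings() would presumably just read user_embeds.weight back out for saving and t-SNE plotting.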
Example #3
    def train(self, emails, val_data, w2v, epochs=10, save_model=True):
        optimizer = optim.RMSprop(self.parameters(),
                                  lr=0.001,
                                  alpha=0.99,
                                  momentum=0.0)
        pos_label = autograd.Variable(torch.LongTensor([1]))  # label for correct (real) emails
        neg_label = autograd.Variable(torch.LongTensor([0]))  # label for incorrect (negative-sampled) emails

        neg_emails = dal.get_negative_emails(emails, fraction=1.0)

        for epoch in range(epochs):
            epoch_loss = 0.0
            start = time.time()
            for i in range(len(emails)):
                optimizer.zero_grad()
                loss, valid = self.predict(emails[i, :],
                                           w2v,
                                           label=pos_label,
                                           training_mode=True)
                if valid:
                    # propagate the loss backward and compute the gradient
                    loss.backward()
                    # change weights based on gradient value
                    optimizer.step()
                    epoch_loss += loss.data.numpy()
                    optimizer.zero_grad()

                loss, valid = self.predict(neg_emails[i, :],
                                           w2v,
                                           label=neg_label,
                                           training_mode=True)
                if valid:
                    # propagate the loss backward and compute the gradient
                    loss.backward()
                    # change weights based on gradient value
                    optimizer.step()
                    epoch_loss += loss.data.numpy()
            end = time.time()
            print('time taken for epoch:', (end - start))
            print('loss in epoch ' + str(epoch) + ' = ' + str(epoch_loss))

        if save_model:
            file_name = constants.RUN_ID + '_model.pth'
            self.save(file_name)
        email_ids, embs = self.extract_user_embeddings()
        utils.save_user_embeddings(email_ids, embs)
        # utils.get_similar_users(email_ids, embs)
        plots.plot_with_tsne(email_ids, embs, display_hover=False)
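
Example #3 recasts the task as a discriminator: each real email is scored with pos_label and a corrupted counterpart from dal.get_negative_emails with neg_label, so predict() here has to accept a class label and return a classification loss. A sketch of that variant follows; the two-way classifier head (self.classify), the w2v averaging call, and the module-level imports are assumptions, not part of the excerpt.

    # Hypothetical sketch (not in the source): a predict() matching Example #3's
    # calls. It classifies a (sender, receiver, email content) row as real (1)
    # or corrupted (0). self.classify is an assumed two-way head.
    def predict(self, email, w2v, label=None, training_mode=False):
        # training_mode is accepted to match the call sites; this sketch does not use it
        sender_id = utils.get_userid(email[constants.SENDER_EMAIL])
        recv_id = utils.get_userid(email[constants.RECEIVER_EMAILS].split('|')[0])
        email_rep = w2v.get_email_reps(email.reshape(1, -1), average=True)[0]
        if sender_id is None or recv_id is None or email_rep is None:
            return None, False  # unknown user or no usable word vectors
        logits = self.classify(torch.LongTensor([sender_id]),
                               torch.LongTensor([recv_id]),
                               torch.from_numpy(email_rep).float().unsqueeze(0))
        # label is the LongTensor([1]) / LongTensor([0]) passed in by the loop above
        loss = nn.CrossEntropyLoss()(logits, label)
        return loss, True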