# Example 1
    def __init__(self, seq_len=20, learning_rate=3e-4):
        """Build all sub-modules on the available device and set up the
        BCE criterion plus a single Adam optimizer over every sub-module.

        Args:
            seq_len: history sequence length passed to the encoder.
            learning_rate: Adam learning rate shared by all sub-modules.
        """
        # BUG FIX: torch.device() rejects device strings with embedded
        # spaces ("cuda: 0" raises "Invalid device string"); use "cuda:0".
        device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.device = device
        self.seq_len = seq_len
        # NOTE(review): the timestamp contains ':', which is not a legal
        # path character on Windows -- confirm runs are POSIX-only.
        time_stamp = time.strftime("%m-%d-%Y_%H:%M:%S", time.localtime())
        print("run on device", device, ",current time:", time_stamp)
        self.writer = SummaryWriter('runs/emb_graph' + time_stamp)

        # define layers
        self.categ_embedding = CategoricalEmbedding().to(device)
        self.r2s_embedding = Route2Stop(vertex_feature=105,
                                        edge_feature=112).to(device)
        self.encoder = Encoder(input_size=100, seq_len=seq_len).to(device)
        self.fcn = FCN(input_size=100).to(device)
        self.similarity = Similarity(input_size=30, device=device).to(device)

        # define training parameters: one optimizer over every sub-module
        self.criterion = nn.BCELoss()
        self.optimizer = optim.Adam(
            [{
                'params': self.categ_embedding.parameters()
            }, {
                'params': self.r2s_embedding.parameters()
            }, {
                'params': self.encoder.parameters()
            }, {
                'params': self.fcn.parameters()
            }, {
                'params': self.similarity.parameters()
            }],
            lr=learning_rate)
# Example 2
def add_similarities_to_db(city_id):
    """Store similarity scores between `city_id` and every other city.

    For each other city that has a WikipediaPage row, computes the cosine
    similarity of page contents and adds one Similarity row; commits once
    at the end.
    """
    results = db.session.query(WikipediaPage.city_id) \
        .filter(WikipediaPage.city_id != city_id).all()

    other_ids = [row[0] for row in results]

    # BUG FIX: the first page is the same for every pair -- fetch it once
    # instead of re-querying it on each iteration. Also avoid the original
    # '%s-%s' format/split round-trip, which converted both ids to strings
    # before they were used to query.
    page1 = WikipediaPage.query.filter_by(city_id=city_id).one()

    for other_id in other_ids:
        page2 = WikipediaPage.query.filter_by(city_id=other_id).one()

        similarity_score = cosine_similarity(page1.content, page2.content)

        similarity = Similarity(combo_id='%s-%s' % (city_id, other_id),
                                city_id_1=city_id,
                                city_id_2=other_id,
                                similarity=similarity_score)
        db.session.add(similarity)

    # single commit keeps the whole batch atomic
    db.session.commit()
# Example 3
    def store_similarity(self, user_id1, user_id2, similarity):
        """Persist one pairwise user-similarity row.

        Rolls back on any failure and always closes the session.
        """
        session = Session()
        try:
            sim = Similarity(user_id1=user_id1,
                             user_id2=user_id2,
                             similarity=similarity)
            session.add(sim)
            session.commit()

        except Exception as e:
            # BUG FIX: `print e` is Python 2 syntax and a SyntaxError
            # under Python 3; use the print() function.
            print(e)
            session.rollback()

        finally:
            session.close()
# Example 4
def add_similarities_to_db(city_ids):
    """Store the pairwise similarity of every unordered pair in city_ids.

    Commits a single transaction after all pairs have been added.
    """
    # BUG FIX: iterate the index pairs directly instead of formatting each
    # pair into a '%s-%s' string and splitting it back, which converted the
    # ids to strings before they were used to query. Also fetch page1 once
    # per outer id instead of once per pair.
    for i in range(len(city_ids)):
        id1 = city_ids[i]
        page1 = WikipediaPage.query.filter_by(city_id=id1).one()

        for j in range(i + 1, len(city_ids)):
            id2 = city_ids[j]
            page2 = WikipediaPage.query.filter_by(city_id=id2).one()

            similarity_score = cosine_similarity(page1.content, page2.content)

            similarity = Similarity(combo_id='%s-%s' % (id1, id2),
                                    city_id_1=id1,
                                    city_id_2=id2,
                                    similarity=similarity_score)
            db.session.add(similarity)

    db.session.commit()
# Example 5
class Model:
    """Sequence-similarity training model.

    Embeds a history sequence ("old") and two candidates ("real"/"fake"),
    scores each candidate against the encoded history, and trains all
    sub-modules jointly with a BCE loss.
    """

    def __init__(self, seq_len=20, learning_rate=3e-4):
        """Build sub-modules, loss and optimizer.

        Args:
            seq_len: length of the history sequence fed to the encoder.
            learning_rate: Adam learning rate shared by all sub-modules.
        """
        # BUG FIX: torch.device() rejects device strings with embedded
        # spaces ("cuda: 0" raises "Invalid device string"); use "cuda:0".
        device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.device = device
        self.seq_len = seq_len
        time_stamp = time.strftime("%m-%d-%Y_%H:%M:%S", time.localtime())
        print("run on device", device, ",current time:", time_stamp)
        self.writer = SummaryWriter('runs/emb_graph' + time_stamp)

        # define layers
        self.categ_embedding = CategoricalEmbedding().to(device)
        self.r2s_embedding = Route2Stop(vertex_feature=105,
                                        edge_feature=112).to(device)
        self.encoder = Encoder(input_size=100, seq_len=seq_len).to(device)
        self.fcn = FCN(input_size=100).to(device)
        self.similarity = Similarity(input_size=30, device=device).to(device)

        # define training parameters: one optimizer over every sub-module
        self.criterion = nn.BCELoss()
        self.optimizer = optim.Adam(
            [{
                'params': self.categ_embedding.parameters()
            }, {
                'params': self.r2s_embedding.parameters()
            }, {
                'params': self.encoder.parameters()
            }, {
                'params': self.fcn.parameters()
            }, {
                'params': self.similarity.parameters()
            }],
            lr=learning_rate)

    def forward(self, old, real, fake, numer_list, categ_list):
        """Score one (history, real, fake) triple.

        Returns:
            (score_real, score_fake): similarity of the real and the fake
            candidate against the encoded history.
        """
        # shared categorical embedding for all three inputs
        old = self.categ_embedding(old, numer_list, categ_list, self.device)
        real = self.categ_embedding(real, numer_list, categ_list, self.device)
        fake = self.categ_embedding(fake, numer_list, categ_list, self.device)

        old = self.r2s_embedding(old)
        real = self.r2s_embedding(real)
        fake = self.r2s_embedding(fake)

        # history goes through the sequence encoder, candidates through FCN
        old = self.encoder(old)
        real = self.fcn(real)
        fake = self.fcn(fake)

        score_real = self.similarity(old, real)
        score_fake = self.similarity(old, fake)
        return score_real, score_fake

    def metrics(self, score_real, score_fake, label_real_test,
                label_fake_test):
        """Compute accuracy/precision/recall/F1 from two-column scores."""
        y_true = np.concatenate(
            [label_real_test.cpu().numpy(),
             label_fake_test.cpu().numpy()],
            axis=0)
        # argmax over the two output columns turns scores into 0/1 labels
        y_pred = torch.cat([
            torch.argmax(score_real, dim=1, keepdim=True),
            torch.argmax(score_fake, dim=1, keepdim=True)
        ],
                           dim=0).cpu().numpy()
        acc = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)
        return acc, precision, recall, f1

    def train_and_test(self, data, batch_size=64, num_epoch=50):
        """Train for `num_epoch` epochs, testing after each epoch.

        Returns:
            (acc, precision, recall, f1) of the final epoch's test run.
        """
        # initialize labels before training: real -> class 1, fake -> class 0
        label_real = torch.cat(
            [torch.zeros([batch_size, 1]),
             torch.ones([batch_size, 1])], dim=1).to(self.device)
        label_fake = torch.cat(
            [torch.ones([batch_size, 1]),
             torch.zeros([batch_size, 1])], dim=1).to(self.device)

        old_test, real_test, fake_test = data.test
        test_size = real_test.shape[0]
        label_real_test = torch.ones([test_size,
                                      1]).type(torch.long).to(self.device)
        label_fake_test = torch.zeros([test_size,
                                       1]).type(torch.long).to(self.device)

        for epoch in range(num_epoch):
            total_loss = [0] * len(data)
            total_loss_real = [0] * len(data)
            # training first
            for i, chunk in enumerate(data.train):
                old_chunk, real_chunk, fake_chunk = chunk
                num_batch = real_chunk.shape[0] // batch_size
                for batch in range(num_batch):
                    # get a batch of data pair: (old, real, fake); the
                    # history chunk holds seq_len rows per sample
                    old_batch = old_chunk.iloc[batch * self.seq_len *
                                               batch_size:(batch + 1) *
                                               self.seq_len * batch_size, :]
                    real_batch = real_chunk.iloc[batch *
                                                 batch_size:(batch + 1) *
                                                 batch_size, :]
                    fake_batch = fake_chunk.iloc[batch *
                                                 batch_size:(batch + 1) *
                                                 batch_size, :]

                    score_real, score_fake = self.forward(
                        old_batch, real_batch, fake_batch, data.numer_list,
                        data.categ_list)

                    loss_real = self.criterion(score_real, label_real)
                    loss_fake = self.criterion(score_fake, label_fake)
                    loss = loss_real + loss_fake

                    total_loss[i] += loss.data
                    total_loss_real[i] += loss_real.data
                    self.optimizer.zero_grad()

                    loss.backward()
                    self.optimizer.step()

                    if (batch + 1) % 100 == 0:
                        print(
                            "epoch: %d, chunk: %d, batch: %d, loss: %.3f, real: %.3f, fake: %.3f"
                            % (epoch, i, batch + 1, loss.data, loss_real.data,
                               loss_fake.data))
                # BUG FIX: average over the number of batches; the original
                # divided by `batch`, the *last batch index* (num_batch - 1).
                denom = max(num_batch, 1)
                total_loss[i] = (total_loss[i] / denom).cpu().numpy()
                total_loss_real[i] = (total_loss_real[i] / denom).cpu().numpy()

            # testing
            score_real, score_fake = self.forward(old_test, real_test,
                                                  fake_test, data.numer_list,
                                                  data.categ_list)
            acc, precision, recall, f1 = self.metrics(score_real, score_fake,
                                                      label_real_test,
                                                      label_fake_test)
            print("test acc: %.4f" % acc)
            self.writer.add_scalar('testing accuracy', acc, epoch)
            # print result and save loss in tensorboard
            print("epoch: %d, average loss: %.4f" %
                  (epoch, np.mean(total_loss)))
            self.writer.add_scalars('training loss', {
                'overall': np.mean(total_loss),
                'good': np.mean(total_loss_real)
            }, epoch)

        # BUG FIX: the original returned *inside* the epoch loop, so only one
        # epoch ever ran regardless of num_epoch, and it closed the writer
        # mid-loop (forcing a new event file per epoch). Close once here and
        # return the last epoch's metrics.
        self.writer.close()
        return acc, precision, recall, f1
# Example 6
def recommend_train(context_text_encoder: TextEncoder,
                    context_image_encoder: ImageEncoder,
                    context_encoder: ContextEncoder,
                    train_dataset: Dataset,
                    valid_dataset: Dataset,
                    test_dataset: Dataset,
                    model_file: str,
                    vocab_size: int,
                    embed_init=None):
    """Recommend train.

    Jointly trains the similarity model and the context encoders, validating
    every `valid_freq` batches and early-stopping (then testing) once the
    validation loss fails to improve `patience` times in a row. The best
    model so far is checkpointed to `model_file`.

    Args:
        context_text_encoder (TextEncoder): Context text encoder.
        context_image_encoder (ImageEncoder): Context image encoder.
        context_encoder (ContextEncoder): Context encoder.
        train_dataset (Dataset): Train dataset.
        valid_dataset (Dataset): Valid dataset.
        test_dataset (Dataset): Test dataset.
        model_file (str): Saved model file.
        vocab_size (int): Vocabulary size.
        embed_init: Initial embedding (vocab_size, embed_size).
    """

    # Data loader.
    train_data_loader = DataLoader(
        dataset=train_dataset,
        batch_size=RecommendTrainConfig.batch_size,
        shuffle=True,
        num_workers=RecommendTrainConfig.num_data_loader_workers)

    # Model.
    similarity_config = SimilarityConfig(vocab_size, embed_init)
    similarity = Similarity(similarity_config).to(GlobalConfig.device)

    # Model parameters: one optimizer over every sub-model.
    params = list(
        chain.from_iterable([
            list(model.parameters()) for model in [
                context_text_encoder, context_image_encoder, context_encoder,
                similarity
            ]
        ]))

    optimizer = Adam(params, lr=RecommendTrainConfig.learning_rate)
    epoch_id = 0
    min_valid_loss = None

    # Resume from a saved state if one exists.
    # NOTE(review): only similarity/optimizer weights are restored here,
    # not the encoders saved in the checkpoint -- confirm this is intended.
    if isfile(model_file):
        state = torch.load(model_file)
        similarity.load_state_dict(state['similarity'])
        optimizer.load_state_dict(state['optimizer'])
        epoch_id = state['epoch_id']
        min_valid_loss = state['min_valid_loss']

    # Loss accumulator (plain float) and early-stopping counter.
    sum_loss = 0
    bad_loss_cnt = 0

    # Switch to train mode.
    context_text_encoder.train()
    context_image_encoder.train()
    context_encoder.train()
    similarity.train()

    finished = False

    for epoch_id in range(epoch_id, RecommendTrainConfig.num_iterations):
        for batch_id, train_data in enumerate(train_data_loader):
            # Sets gradients to 0.
            optimizer.zero_grad()

            context_dialog, pos_products, neg_products = train_data

            texts, text_lengths, images, utter_types = context_dialog
            # Sizes:
            # texts: (batch_size, dialog_context_size + 1, dialog_text_max_len)
            # text_lengths: (batch_size, dialog_context_size + 1)
            # images: (batch_size, dialog_context_size + 1,
            #          pos_images_max_num, 3, image_size, image_size)
            # utter_types: (batch_size, )

            batch_size = texts.size(0)

            # To device.
            texts = texts.to(GlobalConfig.device)
            text_lengths = text_lengths.to(GlobalConfig.device)
            images = images.to(GlobalConfig.device)
            # utter_types = utter_types.to(GlobalConfig.device)

            # In-place transposes put the dialog-context axis first.
            texts.transpose_(0, 1)
            # (dialog_context_size + 1, batch_size, dialog_text_max_len)

            text_lengths.transpose_(0, 1)
            # (dialog_context_size + 1, batch_size)

            images.transpose_(0, 1)
            images.transpose_(1, 2)
            # (dialog_context_size + 1, pos_images_max_num, batch_size, 3,
            #  image_size, image_size)

            # Encode context.
            context, _ = encode_context(context_text_encoder,
                                        context_image_encoder, context_encoder,
                                        texts, text_lengths, images)
            # (batch_size, context_vector_size)

            loss = recommend_loss(similarity, batch_size, context,
                                  pos_products, neg_products)
            # BUG FIX: accumulate the scalar value, not the loss tensor;
            # keeping the tensor retains its whole autograd graph between
            # prints and steadily grows GPU memory.
            sum_loss += loss.item()

            loss.backward()
            optimizer.step()

            # Print loss every `TrainConfig.print_freq` batches.
            if (batch_id + 1) % RecommendTrainConfig.print_freq == 0:
                cur_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                sum_loss /= RecommendTrainConfig.print_freq
                print('epoch: {} \tbatch: {} \tloss: {} \ttime: {}'.format(
                    epoch_id + 1, batch_id + 1, sum_loss, cur_time))
                sum_loss = 0

            # Valid every `TrainConfig.valid_freq` batches.
            if (batch_id + 1) % RecommendTrainConfig.valid_freq == 0:
                valid_loss = recommend_valid(context_text_encoder,
                                             context_image_encoder,
                                             context_encoder, similarity,
                                             valid_dataset)
                cur_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                print('valid_loss: {} \ttime: {}'.format(valid_loss, cur_time))

                # Save current best model.
                if min_valid_loss is None or valid_loss < min_valid_loss:
                    min_valid_loss = valid_loss
                    bad_loss_cnt = 0
                    save_dict = {
                        'task': RECOMMEND_TASK,
                        'epoch_id': epoch_id,
                        'min_valid_loss': min_valid_loss,
                        'optimizer': optimizer.state_dict(),
                        'context_text_encoder':
                        context_text_encoder.state_dict(),
                        'context_image_encoder':
                        context_image_encoder.state_dict(),
                        'context_encoder': context_encoder.state_dict(),
                        'similarity': similarity.state_dict()
                    }
                    torch.save(save_dict, model_file)
                    print('Best model saved.')
                else:
                    bad_loss_cnt += 1

                # Early stop: run the test split once, then abort training.
                if bad_loss_cnt > RecommendTrainConfig.patience:
                    recommend_test(context_text_encoder, context_image_encoder,
                                   context_encoder, similarity, test_dataset)
                    finished = True
                    break
        if finished:
            break
# Example 7
def recommend_valid(
        context_text_encoder: TextEncoder,
        context_image_encoder: ImageEncoder,
        context_encoder: ContextEncoder,
        similarity: Similarity,
        valid_dataset: Dataset):
    """Recommend valid.

    Evaluates recommendation loss and recall@k on (at most
    `RecommendValidConfig.num_batches` batches of) the validation set,
    then restores all models to train mode.

    Args:
        context_text_encoder (TextEncoder): Context text encoder.
        context_image_encoder (ImageEncoder): Context image encoder.
        context_encoder (ContextEncoder): Context encoder.
        similarity (Similarity): Intention.
        valid_dataset (Dataset): Valid dataset.

    Returns:
        Mean validation loss over the evaluated batches (assumes the
        loader yields at least one batch).
    """

    # Valid dataset loader.
    valid_data_loader = DataLoader(
        valid_dataset,
        batch_size=RecommendValidConfig.batch_size,
        shuffle=True,
        num_workers=RecommendValidConfig.num_data_loader_workers
    )

    sum_loss = 0
    num_batches = 0

    # Switch to eval mode.
    context_text_encoder.eval()
    context_image_encoder.eval()
    context_encoder.eval()
    # similarity.eval()
    # There might be a bug in the implement of resnet.

    # Histogram of the rank the positive product achieved (0 = best).
    num_ranks = torch.zeros(DatasetConfig.neg_images_max_num + 1,
                            dtype=torch.long)
    num_ranks = num_ranks.to(GlobalConfig.device)
    total_samples = 0

    with torch.no_grad():
        for batch_id, valid_data in enumerate(valid_data_loader):
            # Only valid `ValidConfig.num_batches` batches.
            if batch_id >= RecommendValidConfig.num_batches:
                break
            num_batches += 1

            context_dialog, pos_products, neg_products = valid_data

            texts, text_lengths, images, utter_types = context_dialog
            # Sizes:
            # texts: (batch_size, dialog_context_size + 1, dialog_text_max_len)
            # text_lengths: (batch_size, dialog_context_size + 1)
            # images: (batch_size, dialog_context_size + 1,
            #          pos_images_max_num, 3, image_size, image_size)
            # utter_types: (batch_size, )

            batch_size = texts.size(0)

            # To device.
            texts = texts.to(GlobalConfig.device)
            text_lengths = text_lengths.to(GlobalConfig.device)
            images = images.to(GlobalConfig.device)
            # utter_types = utter_types.to(GlobalConfig.device)

            # In-place transposes put the dialog-context axis first.
            texts.transpose_(0, 1)
            # (dialog_context_size + 1, batch_size, dialog_text_max_len)

            text_lengths.transpose_(0, 1)
            # (dialog_context_size + 1, batch_size)

            images.transpose_(0, 1)
            images.transpose_(1, 2)
            # (dialog_context_size + 1, pos_images_max_num, batch_size, 3,
            #  image_size, image_size)

            # Encode context.
            context, _ = encode_context(
                context_text_encoder,
                context_image_encoder,
                context_encoder,
                texts,
                text_lengths,
                images
            )
            # (batch_size, context_vector_size)

            loss = recommend_loss(similarity, batch_size, context,
                                  pos_products, neg_products)
            # BUG FIX: accumulate the scalar value, not the loss tensor;
            # keeping the tensor needlessly holds GPU memory and makes the
            # function return a tensor instead of a float.
            sum_loss += loss.item()

            num_rank = recommend_eval(
                similarity,
                batch_size,
                context,
                pos_products,
                neg_products
            )

            total_samples += batch_size

            num_ranks += num_rank

    # Cumulative rank counts give recall@k.
    # NOTE(review): this prints recall@1..max but skips the final bucket
    # (rank == neg_images_max_num) -- confirm that is intentional.
    for i in range(DatasetConfig.neg_images_max_num):
        print('total recall@{} = {}'.format(
            i + 1,
            torch.sum(num_ranks[:i + 1]).item() / total_samples))

    # Switch to train mode.
    context_text_encoder.train()
    context_image_encoder.train()
    context_encoder.train()
    similarity.train()

    return sum_loss / num_batches