def __init__(self, seq_len=20, learning_rate=3e-4):
    """Build the model's sub-modules, loss criterion and joint optimizer.

    Args:
        seq_len (int): Length of the historical route sequence fed to the encoder.
        learning_rate (float): Adam learning rate shared by all sub-modules.
    """
    # BUG FIX: "cuda: 0" (with a space) is not a valid torch device string
    # and raises RuntimeError on CUDA machines; the correct form is "cuda:0".
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.device = device
    self.seq_len = seq_len
    time_stamp = time.strftime("%m-%d-%Y_%H:%M:%S", time.localtime())
    print("run on device", device, ",current time:", time_stamp)
    # One TensorBoard run directory per training session, keyed by start time.
    self.writer = SummaryWriter('runs/emb_graph' + time_stamp)

    # define layers
    self.categ_embedding = CategoricalEmbedding().to(device)
    self.r2s_embedding = Route2Stop(vertex_feature=105, edge_feature=112).to(device)
    self.encoder = Encoder(input_size=100, seq_len=seq_len).to(device)
    self.fcn = FCN(input_size=100).to(device)
    self.similarity = Similarity(input_size=30, device=device).to(device)

    # define training parameters: one Adam optimizer over every sub-module.
    self.criterion = nn.BCELoss()
    self.optimizer = optim.Adam(
        [{'params': self.categ_embedding.parameters()},
         {'params': self.r2s_embedding.parameters()},
         {'params': self.encoder.parameters()},
         {'params': self.fcn.parameters()},
         {'params': self.similarity.parameters()}],
        lr=learning_rate)
def add_similarities_to_db(city_id):
    """Compute and store similarity rows pairing `city_id` with every other city.

    For each other city that has a WikipediaPage row, computes the cosine
    similarity of the two pages' contents and inserts a Similarity row keyed
    by "<city_id>-<other_id>". Commits once after all rows are added.
    """
    results = db.session.query(WikipediaPage.city_id) \
        .filter(WikipediaPage.city_id != city_id).all()
    city_ids = [result[0] for result in results]

    # PERF: the first half of every combo is always `city_id`, so fetch its
    # page once instead of re-querying the same row on every iteration.
    page1 = WikipediaPage.query.filter_by(city_id=city_id).one()

    for other_id in city_ids:
        combo_id = '%s-%s' % (city_id, other_id)
        page2 = WikipediaPage.query.filter_by(city_id=other_id).one()
        similarity_score = cosine_similarity(page1.content, page2.content)
        similarity = Similarity(combo_id=combo_id,
                                city_id_1=city_id,
                                city_id_2=other_id,
                                similarity=similarity_score)
        db.session.add(similarity)
    db.session.commit()
def store_similarity(self, user_id1, user_id2, similarity):
    """Persist a pairwise user similarity score.

    Rolls back the transaction on any failure and always closes the session.

    Args:
        user_id1: First user id of the pair.
        user_id2: Second user id of the pair.
        similarity: Similarity score to store.
    """
    session = Session()
    try:
        sim = Similarity(user_id1=user_id1,
                         user_id2=user_id2,
                         similarity=similarity)
        session.add(sim)
        session.commit()
    except Exception as e:
        # BUG FIX: `print e` was Python 2 print-statement syntax — a
        # SyntaxError on Python 3. Report the error, then roll back.
        print(e)
        session.rollback()
    finally:
        session.close()
def add_similarities_to_db(city_ids):
    """Compute and store similarity rows for every unordered pair of cities.

    Builds one Similarity row per pair of ids in `city_ids`, keyed by
    "<id1>-<id2>", and commits once after all rows are added.
    """
    from itertools import combinations

    # combinations() yields each unordered pair exactly once, in the same
    # (i, j > i) order the original hand-rolled nested index loops produced.
    combo_ids = ['%s-%s' % (a, b) for a, b in combinations(city_ids, 2)]
    for combo_id in combo_ids:
        id1, id2 = combo_id.split("-")
        page1 = WikipediaPage.query.filter_by(city_id=id1).one()
        page2 = WikipediaPage.query.filter_by(city_id=id2).one()
        similarity_score = cosine_similarity(page1.content, page2.content)
        similarity = Similarity(combo_id=combo_id,
                                city_id_1=id1,
                                city_id_2=id2,
                                similarity=similarity_score)
        db.session.add(similarity)
    db.session.commit()
class Model:
    """End-to-end trainer: embeds route data, scores real vs. fake candidates
    against an encoded history, and optimizes all sub-modules jointly with
    BCE loss. Logs progress to TensorBoard."""

    def __init__(self, seq_len=20, learning_rate=3e-4):
        """Build the model's sub-modules, loss criterion and joint optimizer.

        Args:
            seq_len (int): Length of the historical route sequence fed to the encoder.
            learning_rate (float): Adam learning rate shared by all sub-modules.
        """
        # BUG FIX: "cuda: 0" (with a space) is not a valid torch device
        # string and raises RuntimeError on CUDA machines; use "cuda:0".
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.device = device
        self.seq_len = seq_len
        time_stamp = time.strftime("%m-%d-%Y_%H:%M:%S", time.localtime())
        print("run on device", device, ",current time:", time_stamp)
        self.writer = SummaryWriter('runs/emb_graph' + time_stamp)

        # define layers
        self.categ_embedding = CategoricalEmbedding().to(device)
        self.r2s_embedding = Route2Stop(vertex_feature=105, edge_feature=112).to(device)
        self.encoder = Encoder(input_size=100, seq_len=seq_len).to(device)
        self.fcn = FCN(input_size=100).to(device)
        self.similarity = Similarity(input_size=30, device=device).to(device)

        # define training parameters: one Adam optimizer over every sub-module.
        self.criterion = nn.BCELoss()
        self.optimizer = optim.Adam(
            [{'params': self.categ_embedding.parameters()},
             {'params': self.r2s_embedding.parameters()},
             {'params': self.encoder.parameters()},
             {'params': self.fcn.parameters()},
             {'params': self.similarity.parameters()}],
            lr=learning_rate)

    def forward(self, old, real, fake, numer_list, categ_list):
        """Score one (history, real, fake) triple.

        Embeds all three inputs, encodes the history, projects the two
        candidates, and returns their similarity scores against the history.

        Returns:
            (score_real, score_fake): Similarity outputs for each candidate.
        """
        old = self.categ_embedding(old, numer_list, categ_list, self.device)
        real = self.categ_embedding(real, numer_list, categ_list, self.device)
        fake = self.categ_embedding(fake, numer_list, categ_list, self.device)
        old = self.r2s_embedding(old)
        real = self.r2s_embedding(real)
        fake = self.r2s_embedding(fake)
        old = self.encoder(old)
        real = self.fcn(real)
        fake = self.fcn(fake)
        score_real = self.similarity(old, real)
        score_fake = self.similarity(old, fake)
        return score_real, score_fake

    def metrics(self, score_real, score_fake, label_real_test, label_fake_test):
        """Compute accuracy/precision/recall/F1 over the concatenated
        (real, fake) test scores, using argmax over the score columns as
        the predicted class."""
        y_true = np.concatenate(
            [label_real_test.cpu().numpy(), label_fake_test.cpu().numpy()],
            axis=0)
        y_pred = torch.cat([
            torch.argmax(score_real, dim=1, keepdim=True),
            torch.argmax(score_fake, dim=1, keepdim=True)
        ], dim=0).cpu().numpy()
        acc = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)
        return acc, precision, recall, f1

    def train_and_test(self, data, batch_size=64, num_epoch=50):
        """Train on data.train chunks and evaluate on data.test each epoch.

        Args:
            data: Dataset object exposing .train (iterable of chunk triples),
                  .test (old/real/fake triple), .numer_list and .categ_list.
                  NOTE(review): assumes len(data) equals the number of train
                  chunks — confirm against the data class.
            batch_size (int): Mini-batch size.
            num_epoch (int): Number of training epochs.

        Returns:
            (acc, precision, recall, f1) from the final epoch's test pass.
        """
        # initialize labels before training; real pairs target column 1,
        # fake pairs target column 0 (two-column one-hot for BCELoss).
        label_real = torch.cat(
            [torch.zeros([batch_size, 1]), torch.ones([batch_size, 1])],
            dim=1).to(self.device)
        label_fake = torch.cat(
            [torch.ones([batch_size, 1]), torch.zeros([batch_size, 1])],
            dim=1).to(self.device)
        old_test, real_test, fake_test = data.test
        test_size = real_test.shape[0]
        label_real_test = torch.ones([test_size, 1]).type(torch.long).to(self.device)
        label_fake_test = torch.zeros([test_size, 1]).type(torch.long).to(self.device)
        for epoch in range(num_epoch):
            total_loss = [0] * len(data)
            total_loss_real = [0] * len(data)
            # training first
            for i, chunk in enumerate(data.train):
                old_chunk, real_chunk, fake_chunk = chunk
                num_batch = real_chunk.shape[0] // batch_size
                for batch in range(num_batch):
                    # get a batch of data pair: (old, real, fake); the history
                    # chunk is seq_len rows per sample, hence the scaled slice.
                    old_batch = old_chunk.iloc[batch * self.seq_len * batch_size:
                                               (batch + 1) * self.seq_len * batch_size, :]
                    real_batch = real_chunk.iloc[batch * batch_size:(batch + 1) * batch_size, :]
                    fake_batch = fake_chunk.iloc[batch * batch_size:(batch + 1) * batch_size, :]
                    score_real, score_fake = self.forward(
                        old_batch, real_batch, fake_batch,
                        data.numer_list, data.categ_list)
                    loss_real = self.criterion(score_real, label_real)
                    loss_fake = self.criterion(score_fake, label_fake)
                    loss = loss_real + loss_fake
                    total_loss[i] += loss.data
                    total_loss_real[i] += loss_real.data
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()
                    if (batch + 1) % 100 == 0:
                        print(
                            "epoch: %d, chunk: %d, batch: %d, loss: %.3f, real: %.3f, fake: %.3f"
                            % (epoch, i, batch + 1, loss.data, loss_real.data,
                               loss_fake.data))
                # BUG FIX: the original divided by `batch` (the last loop
                # index, num_batch - 1), overstating the average and
                # degenerating when num_batch == 1; divide by num_batch.
                total_loss[i] = (total_loss[i] / num_batch).cpu().numpy()
                total_loss_real[i] = (total_loss_real[i] / num_batch).cpu().numpy()
            # testing
            score_real, score_fake = self.forward(old_test, real_test,
                                                  fake_test, data.numer_list,
                                                  data.categ_list)
            acc, precision, recall, f1 = self.metrics(score_real, score_fake,
                                                      label_real_test,
                                                      label_fake_test)
            print("test acc: %.4f" % acc)
            self.writer.add_scalar('testing accuracy', acc, epoch)
            self.writer.close()
            # print result and save loss in tensorboard
            print("epoch: %d, average loss: %.4f" % (epoch, np.mean(total_loss)))
            self.writer.add_scalars('training loss', {
                'overall': np.mean(total_loss),
                'good': np.mean(total_loss_real)
            }, epoch)
            self.writer.close()
        return acc, precision, recall, f1
def recommend_train(context_text_encoder: TextEncoder,
                    context_image_encoder: ImageEncoder,
                    context_encoder: ContextEncoder,
                    train_dataset: Dataset,
                    valid_dataset: Dataset,
                    test_dataset: Dataset,
                    model_file: str,
                    vocab_size: int,
                    embed_init=None):
    """Recommend train.

    Trains the similarity model (plus the three context encoders) on the
    recommendation task, validating periodically, early-stopping after
    `patience` consecutive non-improving validations, and saving the best
    checkpoint to `model_file`.

    Args:
        context_text_encoder (TextEncoder): Context text encoder.
        context_image_encoder (ImageEncoder): Context image encoder.
        context_encoder (ContextEncoder): Context encoder.
        train_dataset (Dataset): Train dataset.
        valid_dataset (Dataset): Valid dataset.
        test_dataset (Dataset): Test dataset.
        model_file (str): Saved model file.
        vocab_size (int): Vocabulary size.
        embed_init: Initial embedding (vocab_size, embed_size).
    """
    # Data loader.
    train_data_loader = DataLoader(
        dataset=train_dataset,
        batch_size=RecommendTrainConfig.batch_size,
        shuffle=True,
        num_workers=RecommendTrainConfig.num_data_loader_workers)

    # Model.
    similarity_config = SimilarityConfig(vocab_size, embed_init)
    similarity = Similarity(similarity_config).to(GlobalConfig.device)

    # Model parameters: optimize all four modules jointly.
    params = list(
        chain.from_iterable([
            list(model.parameters()) for model in [
                context_text_encoder, context_image_encoder, context_encoder,
                similarity
            ]
        ]))
    optimizer = Adam(params, lr=RecommendTrainConfig.learning_rate)
    epoch_id = 0
    min_valid_loss = None

    # Load saved state.
    if isfile(model_file):
        state = torch.load(model_file)
        # BUG FIX: the checkpoint (see save_dict below) also stores the three
        # context encoders' state dicts, but the original code restored only
        # `similarity` and the optimizer — resuming silently re-initialized
        # the encoders. Restore everything that was saved.
        context_text_encoder.load_state_dict(state['context_text_encoder'])
        context_image_encoder.load_state_dict(state['context_image_encoder'])
        context_encoder.load_state_dict(state['context_encoder'])
        similarity.load_state_dict(state['similarity'])
        optimizer.load_state_dict(state['optimizer'])
        epoch_id = state['epoch_id']
        min_valid_loss = state['min_valid_loss']

    # Loss.
    sum_loss = 0
    bad_loss_cnt = 0

    # Switch to train mode.
    context_text_encoder.train()
    context_image_encoder.train()
    context_encoder.train()
    similarity.train()

    finished = False
    for epoch_id in range(epoch_id, RecommendTrainConfig.num_iterations):
        for batch_id, train_data in enumerate(train_data_loader):
            # Sets gradients to 0.
            optimizer.zero_grad()

            context_dialog, pos_products, neg_products = train_data
            texts, text_lengths, images, utter_types = context_dialog
            # Sizes:
            # texts: (batch_size, dialog_context_size + 1, dialog_text_max_len)
            # text_lengths: (batch_size, dialog_context_size + 1)
            # images: (batch_size, dialog_context_size + 1,
            #          pos_images_max_num, 3, image_size, image_size)
            # utter_types: (batch_size, )
            batch_size = texts.size(0)

            # To device.
            texts = texts.to(GlobalConfig.device)
            text_lengths = text_lengths.to(GlobalConfig.device)
            images = images.to(GlobalConfig.device)
            # utter_types = utter_types.to(GlobalConfig.device)

            texts.transpose_(0, 1)
            # (dialog_context_size + 1, batch_size, dialog_text_max_len)
            text_lengths.transpose_(0, 1)
            # (dialog_context_size + 1, batch_size)
            images.transpose_(0, 1)
            images.transpose_(1, 2)
            # (dialog_context_size + 1, pos_images_max_num, batch_size, 3,
            #  image_size, image_size)

            # Encode context.
            context, _ = encode_context(context_text_encoder,
                                        context_image_encoder,
                                        context_encoder, texts, text_lengths,
                                        images)
            # (batch_size, context_vector_size)

            loss = recommend_loss(similarity, batch_size, context,
                                  pos_products, neg_products)
            # BUG FIX: accumulating the raw tensor (`sum_loss += loss`) keeps
            # every batch's autograd graph alive until the next print — a
            # steady memory leak. `.item()` detaches to a plain float.
            sum_loss += loss.item()

            loss.backward()
            optimizer.step()

            # Print loss every `TrainConfig.print_freq` batches.
            if (batch_id + 1) % RecommendTrainConfig.print_freq == 0:
                cur_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                sum_loss /= RecommendTrainConfig.print_freq
                print('epoch: {} \tbatch: {} \tloss: {} \ttime: {}'.format(
                    epoch_id + 1, batch_id + 1, sum_loss, cur_time))
                sum_loss = 0

            # Valid every `TrainConfig.valid_freq` batches.
            if (batch_id + 1) % RecommendTrainConfig.valid_freq == 0:
                valid_loss = recommend_valid(context_text_encoder,
                                             context_image_encoder,
                                             context_encoder, similarity,
                                             valid_dataset)
                cur_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                print('valid_loss: {} \ttime: {}'.format(valid_loss, cur_time))

                # Save current best model.
                if min_valid_loss is None or valid_loss < min_valid_loss:
                    min_valid_loss = valid_loss
                    bad_loss_cnt = 0
                    save_dict = {
                        'task': RECOMMEND_TASK,
                        'epoch_id': epoch_id,
                        'min_valid_loss': min_valid_loss,
                        'optimizer': optimizer.state_dict(),
                        'context_text_encoder':
                            context_text_encoder.state_dict(),
                        'context_image_encoder':
                            context_image_encoder.state_dict(),
                        'context_encoder': context_encoder.state_dict(),
                        'similarity': similarity.state_dict()
                    }
                    torch.save(save_dict, model_file)
                    print('Best model saved.')
                else:
                    # Early stopping: bail out to the test pass after
                    # `patience` consecutive non-improving validations.
                    bad_loss_cnt += 1
                    if bad_loss_cnt > RecommendTrainConfig.patience:
                        recommend_test(context_text_encoder,
                                       context_image_encoder, context_encoder,
                                       similarity, test_dataset)
                        finished = True
                        break
        if finished:
            break
def recommend_valid(
        context_text_encoder: TextEncoder,
        context_image_encoder: ImageEncoder,
        context_encoder: ContextEncoder,
        similarity: Similarity,
        valid_dataset: Dataset):
    """Recommend valid.

    Evaluates the recommendation loss and recall@k over (at most)
    `RecommendValidConfig.num_batches` validation batches, prints cumulative
    recall@k, and returns the mean loss. Restores train mode before returning.

    Args:
        context_text_encoder (TextEncoder): Context text encoder.
        context_image_encoder (ImageEncoder): Context image encoder.
        context_encoder (ContextEncoder): Context encoder.
        similarity (Similarity): Intention.
        valid_dataset (Dataset): Valid dataset.

    Returns:
        float: Average validation loss over the evaluated batches.
    """
    # Valid dataset loader.
    # NOTE(review): shuffle=True means the subset of batches validated each
    # call differs — confirm this sampling behavior is intended.
    valid_data_loader = DataLoader(
        valid_dataset,
        batch_size=RecommendValidConfig.batch_size,
        shuffle=True,
        num_workers=RecommendValidConfig.num_data_loader_workers
    )

    sum_loss = 0
    num_batches = 0

    # Switch to eval mode.
    context_text_encoder.eval()
    context_image_encoder.eval()
    context_encoder.eval()
    # similarity.eval()
    # There might be a bug in the implement of resnet.

    # num_ranks[r] counts samples whose positive item ranked at position r.
    num_ranks = torch.zeros(DatasetConfig.neg_images_max_num + 1,
                            dtype=torch.long)
    num_ranks = num_ranks.to(GlobalConfig.device)
    total_samples = 0

    with torch.no_grad():
        for batch_id, valid_data in enumerate(valid_data_loader):
            # Only valid `ValidConfig.num_batches` batches.
            if batch_id >= RecommendValidConfig.num_batches:
                break
            num_batches += 1

            context_dialog, pos_products, neg_products = valid_data
            texts, text_lengths, images, utter_types = context_dialog
            # Sizes:
            # texts: (batch_size, dialog_context_size + 1, dialog_text_max_len)
            # text_lengths: (batch_size, dialog_context_size + 1)
            # images: (batch_size, dialog_context_size + 1,
            #          pos_images_max_num, 3, image_size, image_size)
            # utter_types: (batch_size, )
            batch_size = texts.size(0)

            # To device.
            texts = texts.to(GlobalConfig.device)
            text_lengths = text_lengths.to(GlobalConfig.device)
            images = images.to(GlobalConfig.device)
            # utter_types = utter_types.to(GlobalConfig.device)

            texts.transpose_(0, 1)
            # (dialog_context_size + 1, batch_size, dialog_text_max_len)
            text_lengths.transpose_(0, 1)
            # (dialog_context_size + 1, batch_size)
            images.transpose_(0, 1)
            images.transpose_(1, 2)
            # (dialog_context_size + 1, pos_images_max_num, batch_size, 3,
            #  image_size, image_size)

            # Encode context.
            context, _ = encode_context(
                context_text_encoder,
                context_image_encoder,
                context_encoder,
                texts, text_lengths, images
            )
            # (batch_size, context_vector_size)

            loss = recommend_loss(similarity, batch_size, context,
                                  pos_products, neg_products)
            # IMPROVEMENT: `.item()` detaches to a plain float so the
            # function returns a float instead of a device tensor (callers'
            # `<` comparison and checkpointing still work unchanged).
            sum_loss += loss.item()

            num_rank = recommend_eval(
                similarity,
                batch_size,
                context,
                pos_products, neg_products
            )

            total_samples += batch_size
            num_ranks += num_rank

    # Cumulative recall@k for k = 1 .. neg_images_max_num (recall at the
    # final position, neg_images_max_num + 1, is always 1 and is skipped).
    for i in range(DatasetConfig.neg_images_max_num):
        print('total recall@{} = {}'.format(
            i + 1, torch.sum(num_ranks[:i + 1]).item() / total_samples))

    # Switch to train mode.
    context_text_encoder.train()
    context_image_encoder.train()
    context_encoder.train()
    similarity.train()

    return sum_loss / num_batches