class TestDataLoader(TestCase):
    def test_load_data(self):
        word2index = {'/': 0, '<': 1, '>': 2, 's': 3, '、': 4, '。': 5,
                      'が': 6, 'た': 7, 'で': 8, 'に': 9, 'の': 10, 'は': 11, 'を': 12}
        index2word = {0: '/', 1: '<', 2: '>', 3: 's', 4: '、', 5: '。',
                      6: 'が', 7: 'た', 8: 'で', 9: 'に', 10: 'の', 11: 'は', 12: 'を'}
        window_data = [('<', '/'), ('<', 's'), ('<', '>'), ('/', '<'),
                       ('/', 's'), ('/', '>'), ('s', '<'), ('s', '/'),
                       ('s', '>'), ('>', '<'), ('>', '/'), ('>', 's')]
        X_ik = {('/', '<'): 2, ('<', '/'): 2, ('/', '>'): 2, ('>', '/'): 2,
                ('/', 's'): 2, ('s', '/'): 2, ('<', '>'): 2, ('>', '<'): 2,
                ('<', 's'): 2, ('s', '<'): 2, ('>', 's'): 2, ('s', '>'): 2}

        self.test_data_loader = DataLoader()
        self.test_japanese_wiki_data = 'test/test_data/jawiki_test.txt'
        test_word2index, test_index2word, test_window_data, \
            test_X_ik, test_weighting_dict = self.test_data_loader.load_data(
                file_name=self.test_japanese_wiki_data)

        # Reference:
        # https://stackoverflow.com/questions/11026959/writing-a-dict-to-txt-file-and-reading-it-back
        APP_PATH = os.path.dirname(__file__)
        with open(APP_PATH + '/test_data/test_weighting_dict.pkl', 'rb') as handle:
            weighting_dict = pickle.loads(handle.read())

        print(test_word2index)
        print(test_index2word)
        print(test_window_data)
        print(test_X_ik)

        assert word2index == test_word2index
        assert index2word == test_index2word
        assert window_data == test_window_data
        assert test_X_ik == X_ik
        assert test_weighting_dict == weighting_dict
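# --- Illustrative sketch (not part of the test above) ---
# The test compares against a precomputed weighting dict. For reference, the
# standard GloVe weighting function from Pennington et al. (2014) is sketched
# below; whether DataLoader.load_data uses exactly these hyperparameters
# (x_max=100, alpha=0.75) is an assumption, not something the snippet shows.
def glove_weighting(x_ik, x_max=100, alpha=0.75):
    """Down-weight rare co-occurrences and cap frequent ones at 1.0."""
    return (x_ik / x_max) ** alpha if x_ik < x_max else 1.0

# Example: every count in X_ik above is 2, so under these assumptions each
# pair would receive a weight of roughly (2 / 100) ** 0.75 ≈ 0.053.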
class TestTrainer(TestCase):
    def test_train_method(self):
        self.test_data_loader = DataLoader()
        self.test_japanese_wiki_data = 'test/test_data/jawiki_test.txt'
        test_word2index, test_index2word, test_window_data, \
            test_X_ik, test_weighting_dict = self.test_data_loader.load_data(
                file_name=self.test_japanese_wiki_data)

        self.test_prepare_train_data = PrepareTrainData()
        test_train_data = \
            self.test_prepare_train_data.prepare_train_data_method(
                window_data=test_window_data,
                word2index=test_word2index,
                weighting_dic=test_weighting_dict,
                X_ik=test_X_ik)

        self.model = Glove(vocab_size=len(test_word2index))
        self.trainer = Trainer(model=self.model)
        self.trainer.train_method(train_data=test_train_data)

        word_similarity = self.trainer.word_similarity(
            target=self.test_data_loader.vocab[0],
            vocab=self.test_data_loader.vocab,
            word2index=test_word2index,
            top_rank=2)

        # At least one of the top-ranked neighbours of the first vocabulary
        # entry should come from the tiny test corpus ('<', '>' or 's').
        word_similarity_check = ['<', '>', 's']
        word_similarity_bool = False
        for word in word_similarity:
            if word[0] in word_similarity_check:
                word_similarity_bool = True
        assert word_similarity_bool is True
class TestClassifier(TestCase):
    def test_classify(self):
        model_name = '../models/glove_wiki/glove_model_40.pth'
        output_file = 'test/test_data/glove_classify_model.pkl'
        compare_output_file = 'glove_classify_model.pkl'
        classifier = Classifier(model_name=model_name)
        classifier.classify()
        assert filecmp.cmp(output_file, compare_output_file) is True

    def test_classify_predict(self):
        self.test_data_loader = DataLoader()
        self.test_japanese_wiki_data = 'test/test_data/jawiki_test.txt'
        test_word2index, test_index2word, test_window_data, \
            test_X_ik, test_weighting_dict = self.test_data_loader.load_data(
                file_name=self.test_japanese_wiki_data)

        model_name = '../models/glove_wiki/glove_model_40.pth'
        output_file = 'test/test_data/glove_classify_model.pkl'
        classifier = Classifier(model_name=model_name)
        print(test_word2index)

        # A word that exists in the vocabulary should map to a real cluster id.
        classes = classifier.classify_predict(word='の',
                                              classify_model_name=output_file,
                                              word2index=test_word2index)
        assert 2 == classes

        # An out-of-vocabulary word should fall back to the sentinel value 9999.
        classes = classifier.classify_predict(word='どうよ?',
                                              classify_model_name=output_file,
                                              word2index=test_word2index)
        assert 9999 == classes
def main(configs: Configs = None, data_loader: DataLoader = None):
    """Main entry point of the data processor.

    Loads the raw SAP files and writes them to database tables that are
    consumed by the forecast model.

    Usage example:
        $ python spike-challenge/src/make_dataset.py
    """
    if configs is None:
        configs = Configs('default_config.yaml')
    if data_loader is None:
        data_loader = DataLoader()
    data_loader.load_data()
class TestGloveVisualize(TestCase):
    def test_visualize(self):
        self.test_data_loader = DataLoader()
        self.test_japanese_wiki_data = '../data/raw/jawiki_only_word_random_choose.txt'
        test_word2index, test_index2word, test_window_data, \
            test_X_ik, test_weighting_dict = self.test_data_loader.load_data(
                file_name=self.test_japanese_wiki_data)

        model_name = '../models/glove_wiki/glove_model_40.pth'
        self.test_glove_visualize = GloveVisualize(model_name=model_name)
        self.test_glove_visualize.visualize(vocab=self.test_data_loader.vocab)
class TestGloveVisualize(TestCase):
    def test_visualize(self):
        self.test_data_loader = DataLoader()
        self.test_japanese_wiki_data = '../data/raw/source_replay_twitter_data_sort.txt'
        test_word2index, test_index2word, test_window_data, \
            test_X_ik, test_weighting_dict = self.test_data_loader.load_data(
                file_name=self.test_japanese_wiki_data)

        model_name = '../models/glove_model_40.pth'
        # Add an <UNK> entry to the word index before visualizing.
        test_word2index.update({'<UNK>': len(test_word2index)})
        self.test_glove_visualize = GloveVisualize(model_name=model_name)
        self.test_glove_visualize.visualize(vocab=self.test_data_loader.vocab)
def main():
    parser = argparse.ArgumentParser(description="Training glove model")
    parser.add_argument(
        "-c", "--train_data",
        metavar="train_data",
        # type=str, default='../data/raw/jawiki_only_word_random_choose.txt',
        type=str,
        default='../data/raw/source_replay_twitter_data_sort.txt',
        dest="train_data",
        help="set the training data")
    parser.add_argument(
        "-e", "--embedding_size",
        metavar="embedding_size",
        type=int,
        default=300,
        dest="embedding_size",
        help="set the embedding size")
    args = parser.parse_args()

    data_loader = DataLoader()
    japanese_wiki_data = args.train_data
    word2index, index2word, window_data, X_ik, weighting_dict = \
        data_loader.load_data(file_name=japanese_wiki_data)
    print(word2index)

    prepare_train_data = PrepareTrainData()
    train_data = \
        prepare_train_data.prepare_train_data_method(
            window_data=window_data,
            word2index=word2index,
            weighting_dic=weighting_dict,
            X_ik=X_ik)

    model = Glove(vocab_size=len(word2index),
                  projection_dim=args.embedding_size)
    trainer = Trainer(model=model)
    trainer.train_method(train_data=train_data)

    word_similarity = trainer.word_similarity(target=data_loader.vocab[0],
                                              vocab=data_loader.vocab,
                                              word2index=word2index,
                                              top_rank=2)
    print(word_similarity)
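# --- Usage note (assumption) ---
# Given the argparse flags defined above, the script would typically be run
# along these lines; the script filename below is hypothetical, as it is not
# shown in the snippet:
#   $ python train_glove.py --train_data ../data/raw/source_replay_twitter_data_sort.txt --embedding_size 300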
class TestPrepareTrainData(TestCase):
    def test_prepare_train_data_method(self):
        self.test_data_loader = DataLoader()
        self.test_japanese_wiki_data = 'test/test_data/jawiki_test.txt'
        test_word2index, test_index2word, test_window_data, \
            test_X_ik, test_weighting_dict = self.test_data_loader.load_data(
                file_name=self.test_japanese_wiki_data)

        self.test_prepare_train_data = PrepareTrainData()
        test_train_data = \
            self.test_prepare_train_data.prepare_train_data_method(
                window_data=test_window_data,
                word2index=test_word2index,
                weighting_dic=test_weighting_dict,
                X_ik=test_X_ik)

        APP_PATH = os.path.dirname(__file__)
        output_file = APP_PATH + '/test_data/train_data.pkl'
        compare_output_file = APP_PATH + '/test_data/test_train_data.pkl'
        with open(output_file, 'wb') as handle:
            pickle.dump(test_train_data, handle)
        assert filecmp.cmp(output_file, compare_output_file) is True
    optimizer = checkpoint["optimizer"]
else:
    print("==> Building model...")
    net = attrWCNNg(num_attr=312, num_classes=NUM_CLASSES)
    # print(torch_summarize(net))
    # print(net)

if USE_GPU:
    net.cuda()
    # net = torch.nn.DataParallel(net.module, device_ids=range(torch.cuda.device_count()))
    cudnn.benchmark = True

log = open("./log/" + MODEL_NAME + '_cub.txt', 'a')

print("==> Preparing data...")
data_loader = DataLoader(data_dir=args.data,
                         image_size=IMAGE_SIZE,
                         batch_size=BATCH_SIZE)
inputs, classes = next(iter(data_loader.load_data()))
# out = torchvision.utils.make_grid(inputs)
# data_loader.show_image(out, title=[data_loader.data_classes[c] for c in classes])
train_loader = data_loader.load_data(data_set='train')
test_loader = data_loader.load_data(data_set='val')

# criterion = nn.CrossEntropyLoss()
criterion = RegLoss(lamda1=lamda1, lamda2=lamda2, superclass="cub")
# criterion = FocalLoss(class_num=NUM_CLASSES, gamma=0)


# def one_hot_emb(batch, depth=NUM_CLASSES):
#     emb = nn.Embedding(depth, depth)
#     emb.weight.data = torch.eye(depth)
#     return emb(batch).data
def one_hot_emb(y, depth=NUM_CLASSES):
    y = y.view((-1, 1))
import pandas as pd

from data.data_loader import DataLoader

data_loader = DataLoader()

# Load files
files = ["orders", "order_products"]
data = data_loader.load_data(files)

# Get orders and user_ids to predict (test)
orders = data["orders"]
test = orders[orders["eval_set"] == "test"]
test_uids = test["user_id"]
orders_prior = orders[(orders["eval_set"] == "prior")
                      & (orders["user_id"].isin(test_uids))]

# Get products of prior orders
products = data["order_products"]
products_prior = products[products["order_id"].isin(orders_prior["order_id"])]

# Get order_id of last order per user
orders_prior_ids = orders_prior.groupby("user_id")["order_number"].idxmax()
last_order_ids = orders_prior.loc[orders_prior_ids]["order_id"]

# Aggregate all products of same order to a list and select last orders
products_prior_list = pd.DataFrame(
    products_prior.groupby('order_id')['product_id'].apply(list))
products_last_order = products_prior_list.loc[last_order_ids]

# Merge to get user_id and list of product_ids
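# --- Illustrative sketch (not part of the script above) ---
# A tiny, self-contained example of the groupby/idxmax pattern used above to
# pick each user's most recent prior order. The column names match the script;
# the values are invented purely for illustration.
import pandas as pd

toy_orders = pd.DataFrame({
    "order_id": [10, 11, 20, 21, 22],
    "user_id": [1, 1, 2, 2, 2],
    "order_number": [1, 2, 1, 2, 3],
})

# idxmax returns, per user, the row index of the order with the highest
# order_number, i.e. the last order placed by that user.
last_idx = toy_orders.groupby("user_id")["order_number"].idxmax()
print(toy_orders.loc[last_idx]["order_id"].tolist())  # [11, 22]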
def gzsl_test0(epoch, net, optimizer, log, gamma=2.):
    NUM_CLASSES = 50  # set the number of classes in your dataset
    num_seen_classes = 40
    NUM_ATTR = 85
    DATA_DIR = "/home/elvis/data/attribute/AwA/Animals_with_Attributes2/zsl/gzsl_test"
    BATCH_SIZE = 32
    IMAGE_SIZE = 224
    best_h = 55
    USE_GPU = torch.cuda.is_available()

    data_loader = DataLoader(data_dir=DATA_DIR,
                             image_size=IMAGE_SIZE,
                             batch_size=BATCH_SIZE)
    # train_loader = data_loader.load_data(data_set='train')
    test_loader = data_loader.load_data(data_set='val')
    criterion = nn.CrossEntropyLoss()

    net.eval()
    test_loss, correct_seen, correct_unseen, total_seen, total_unseen, total, loss = 0, 0, 0, 0, 0, 0, 0
    for batch_idx, (inputs, targets) in enumerate(test_loader):
        if USE_GPU:
            inputs, targets = inputs.cuda(), targets.cuda()
        inputs, targets = Variable(inputs, volatile=True), Variable(targets)
        out, attr = net(inputs)
        loss = criterion(out, targets)
        test_loss = loss.data[0]

        # Take the best seen-class and best unseen-class score per sample and
        # switch to the unseen class whenever its score (scaled by gamma) wins.
        logit = out.data
        seen_prob, seen_class = torch.max(logit[:, :num_seen_classes], 1)
        unseen_prob, unseen_class = torch.max(logit[:, num_seen_classes:], 1)
        predicted = seen_class
        for i, spi in enumerate(seen_prob):
            if seen_prob[i] < unseen_prob[i] * gamma:
                predicted[i] = unseen_class[i] + num_seen_classes

        total += targets.size(0)
        correct_list = predicted.eq(targets.data).cpu()
        target_list = targets.data.cpu()
        for i, targeti in enumerate(target_list):
            if targeti < num_seen_classes:
                correct_seen += correct_list[i]
                total_seen += 1
            else:
                correct_unseen += correct_list[i]
                total_unseen += 1
        acc_seen = 100. * correct_seen / total_seen
        if total_unseen > 0:
            acc_unseen = 100. * correct_unseen / total_unseen
        else:
            acc_unseen = 0.
        progress_bar(
            batch_idx, len(test_loader),
            'Loss: %.3f | acc_seen: %.3f%% (%d/%d) | acc_unseen: %.3f%% (%d/%d)'
            % (test_loss / (batch_idx + 1), acc_seen, correct_seen, total_seen,
               acc_unseen, correct_unseen, total_unseen))

    acc_seen = 100. * correct_seen / total_seen
    acc_unseen = 100. * correct_unseen / total_unseen
    h = 2. / (1. / acc_seen + 1. / acc_unseen)
    print("acc_seen: %.3f%% (%d/%d) | acc_unseen: %.3f%% (%d/%d) | H: %.3f%%"
          % (acc_seen, correct_seen, total_seen,
             acc_unseen, correct_unseen, total_unseen, h))
    log.write(str(acc_seen) + ' ' + str(acc_unseen) + ' ' + str(h) + " ")

    if h > best_h:
        MODEL_SAVE_FILE = "gzsl_awa2_epoch%dacc%d.pth" % (epoch, int(h))
        print(MODEL_SAVE_FILE)
        state = {'net': net, 'acc': h, 'epoch': epoch, 'optimizer': optimizer}
        torch.save(state, "./checkpoints/" + MODEL_SAVE_FILE)
def gzsl_test(epoch, net, optimizer):
    NUM_CLASSES = 50  # set the number of classes in your dataset
    num_seen_classes = 40
    NUM_ATTR = 85
    DATA_DIR = "/home/elvis/data/attribute/AwA/Animals_with_Attributes2/zsl/gzsl_test"
    BATCH_SIZE = 32
    IMAGE_SIZE = 224
    best_h = 40
    USE_GPU = torch.cuda.is_available()

    # order_awa2_attr = np.load("data/order_awa2_attr.npy")
    # w_attr_sum = np.sum(w_attr, 0)
    # w_attr = w_attr/w_attr_sum
    # w_attr[:, 0].sum()
    # order_awa2_attr = torch.FloatTensor(order_awa2_attr / 100.).cuda()  # 50 * 312

    data_loader = DataLoader(data_dir=DATA_DIR,
                             image_size=IMAGE_SIZE,
                             batch_size=BATCH_SIZE)
    # train_loader = data_loader.load_data(data_set='train')
    test_loader = data_loader.load_data(data_set='val')
    criterion = nn.CrossEntropyLoss()

    net.eval()
    test_loss, correct_seen, correct_unseen, total_seen, total_unseen, total, loss = 0, 0, 0, 0, 0, 0, 0
    for batch_idx, (inputs, targets) in enumerate(test_loader):
        if USE_GPU:
            inputs, targets = inputs.cuda(), targets.cuda()
        inputs, targets = Variable(inputs, volatile=True), Variable(targets)
        out, attr = net(inputs)
        loss = criterion(out, targets)
        test_loss = loss.data[0]

        _, predicted = torch.max(out.data, 1)
        total += targets.size(0)
        correct_list = predicted.eq(targets.data).cpu()
        target_list = targets.data.cpu()
        # Labels below num_seen_classes belong to seen classes; the rest are unseen.
        for i, targeti in enumerate(target_list):
            if targeti < num_seen_classes:
                correct_seen += correct_list[i]
                total_seen += 1
            else:
                correct_unseen += correct_list[i]
                total_unseen += 1
        acc_seen = 100. * correct_seen / total_seen
        if total_unseen > 0:
            acc_unseen = 100. * correct_unseen / total_unseen
        else:
            acc_unseen = 0.
        progress_bar(
            batch_idx, len(test_loader),
            'Loss: %.3f | acc_seen: %.3f%% (%d/%d) | acc_unseen: %.3f%% (%d/%d)'
            % (test_loss / (batch_idx + 1), acc_seen, correct_seen, total_seen,
               acc_unseen, correct_unseen, total_unseen))

    acc_seen = 100. * correct_seen / total_seen
    acc_unseen = 100. * correct_unseen / total_unseen
    h = 2. / (1. / acc_seen + 1. / acc_unseen)
    print("acc_seen: %.3f%% (%d/%d) | acc_unseen: %.3f%% (%d/%d) | H: %.3f%%"
          % (acc_seen, correct_seen, total_seen,
             acc_unseen, correct_unseen, total_unseen, h))

    if h > best_h:
        MODEL_SAVE_FILE = "gzsl_awa2_epoch%dacc%d.pth" % (epoch, int(h))
        print(MODEL_SAVE_FILE)
        state = {'net': net, 'acc': h, 'epoch': epoch, 'optimizer': optimizer}
        torch.save(state, "./checkpoints/" + MODEL_SAVE_FILE)
def zsl_test(epoch, net, optimizer, log):
    NUM_CLASSES = 50  # set the number of classes in your dataset
    NUM_SEEN = 40
    NUM_UNSEEN = NUM_CLASSES - NUM_SEEN
    NUM_ATTR = 85
    DATA_DIR = "/home/elvis/data/attribute/AwA/Animals_with_Attributes2/zsl/zsl_test"
    BATCH_SIZE = 32
    IMAGE_SIZE = 224
    best_acc = 74
    USE_GPU = torch.cuda.is_available()

    # Load the class-attribute matrix, keep only the unseen classes, and use it
    # as a fixed (non-trainable) output layer.
    order_awa2_attr = np.load("data/order_awa2_attr.npy")
    # w_attr_sum = np.sum(w_attr, 0)
    # w_attr = w_attr/w_attr_sum
    # w_attr[:, 0].sum()
    order_awa2_attr = order_awa2_attr[NUM_SEEN:, :]
    order_awa2_attr = torch.FloatTensor(order_awa2_attr / 100.).cuda()  # (NUM_UNSEEN, NUM_ATTR)
    net.fc2 = nn.Linear(NUM_ATTR, NUM_CLASSES, bias=False)
    net.fc2.weight = nn.Parameter(order_awa2_attr, requires_grad=False)
    # print(torch_summarize(net))
    # print(net)
    net.cuda()

    data_loader = DataLoader(data_dir=DATA_DIR,
                             image_size=IMAGE_SIZE,
                             batch_size=BATCH_SIZE)
    train_loader = data_loader.load_data(data_set='train')
    test_loader = data_loader.load_data(data_set='val')
    criterion = nn.CrossEntropyLoss()

    net.eval()
    test_loss, correct, total, loss = 0, 0, 0, 0
    correct_bin = np.zeros(NUM_UNSEEN)
    total_bin = np.zeros(NUM_UNSEEN)
    for batch_idx, (inputs, targets) in enumerate(test_loader):
        if USE_GPU:
            inputs, targets = inputs.cuda(), targets.cuda()
        inputs, targets = Variable(inputs, volatile=True), Variable(targets)
        out, attr = net(inputs)
        loss = criterion(out, targets)
        test_loss = loss.data[0]

        _, predicted = torch.max(out.data, 1)
        total += targets.size(0)
        correct += predicted.eq(targets.data).cpu().sum()
        correct_list = predicted.eq(targets.data).cpu()
        target_list = targets.data.cpu()
        # Per-class bookkeeping for the per-class (macro-averaged) accuracy.
        for i, targeti in enumerate(target_list):
            correct_bin[targeti] += correct_list[i]
            total_bin[targeti] += 1.
        acc = 100. * correct / total
        progress_bar(
            batch_idx, len(test_loader),
            'Loss: %.3f | Acc: %.3f%% (%d/%d)'
            % (test_loss / (batch_idx + 1), acc, correct, total))

    acc = 100. * correct / total
    acc_bin = 100. * correct_bin / total_bin
    np.save("data/sun_acc_bin.npy", acc_bin)
    print("ZSL acc_per_class: %.3f%%(%d/%d)"
          % (np.mean(acc_bin), correct_bin[0], total_bin[0]))
    log.write(str(np.mean(acc_bin)) + ' ')

    if acc > best_acc:
        MODEL_SAVE_FILE = "zsl_resnet18_awa2_epoch%dacc%d.pth" % (epoch, int(acc))
        print(MODEL_SAVE_FILE)
        state = {
            'net': net,
            'acc': acc,
            'epoch': epoch,
            'optimizer': optimizer
        }
        torch.save(state, "./checkpoints/" + MODEL_SAVE_FILE)
def gzsl_test(epoch, net, optimizer):
    NUM_CLASSES = 717  # set the number of classes in your dataset
    NUM_SEEN = 645
    NUM_ATTR = 102
    DATA_DIR = "/home/elvis/data/attribute/SUN/zsl/gzsl_test"
    BATCH_SIZE = 32
    IMAGE_SIZE = 224
    best_h = 50
    USE_GPU = torch.cuda.is_available()

    # Use the full SUN class-attribute matrix as a fixed (non-trainable) output layer.
    order_sun_attr = np.load("data/order_sun_attr.npy")
    # order_sun_attr[NUM_SEEN:, :] = order_sun_attr[NUM_SEEN:, :]
    # order_cub_attr = order_cub_attr[150:, :]
    order_sun_attr = torch.FloatTensor(order_sun_attr).cuda()  # (NUM_CLASSES, NUM_ATTR)
    net.fc2 = nn.Linear(NUM_ATTR, NUM_CLASSES, bias=False)
    net.fc2.weight = nn.Parameter(order_sun_attr, requires_grad=False)
    net.cuda()

    data_loader = DataLoader(data_dir=DATA_DIR,
                             image_size=IMAGE_SIZE,
                             batch_size=BATCH_SIZE)
    # train_loader = data_loader.load_data(data_set='train')
    test_loader = data_loader.load_data(data_set='val')
    criterion = nn.CrossEntropyLoss()

    net.eval()
    test_loss, correct_seen, correct_unseen, total_seen, total_unseen, total, loss = 0, 0, 0, 0, 0, 0, 0
    for batch_idx, (inputs, targets) in enumerate(test_loader):
        if USE_GPU:
            inputs, targets = inputs.cuda(), targets.cuda()
        inputs, targets = Variable(inputs, volatile=True), Variable(targets)
        out, attr = net(inputs)
        loss = criterion(out, targets)
        test_loss = loss.data[0]

        _, predicted = torch.max(out.data, 1)
        total += targets.size(0)
        correct_list = predicted.eq(targets.data).cpu()
        target_list = targets.data.cpu()
        for i, targeti in enumerate(target_list):
            if targeti < NUM_SEEN:
                correct_seen += correct_list[i]
                total_seen += 1
            else:
                correct_unseen += correct_list[i]
                total_unseen += 1
        acc_seen = 100. * correct_seen / total_seen
        if total_unseen > 0:
            acc_unseen = 100. * correct_unseen / total_unseen
        else:
            acc_unseen = 0.
        progress_bar(
            batch_idx, len(test_loader),
            'Loss: %.3f | acc_seen: %.3f%% (%d/%d) | acc_unseen: %.3f%% (%d/%d)'
            % (test_loss / (batch_idx + 1), acc_seen, correct_seen, total_seen,
               acc_unseen, correct_unseen, total_unseen))

    acc_seen = 100. * correct_seen / total_seen
    acc_unseen = 100. * correct_unseen / total_unseen
    h = 2. / (1. / acc_seen + 1. / acc_unseen)
    print("acc_seen: %.3f%% (%d/%d) | acc_unseen: %.3f%% (%d/%d) | H: %.3f%%"
          % (acc_seen, correct_seen, total_seen,
             acc_unseen, correct_unseen, total_unseen, h))

    if h > best_h:
        MODEL_SAVE_FILE = "gzsl_resnet50_sun_epoch%dacc%d.pth" % (epoch, int(h))
        print(MODEL_SAVE_FILE)
        state = {'net': net, 'acc': h, 'epoch': epoch, 'optimizer': optimizer}
        torch.save(state, "./checkpoints/" + MODEL_SAVE_FILE)
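# --- Illustrative sketch (not part of the functions above) ---
# The checkpointing criterion in the GZSL test functions is the harmonic mean
# of seen and unseen accuracy, H = 2 * acc_seen * acc_unseen / (acc_seen + acc_unseen),
# which is algebraically the same as the inline h = 2. / (1. / acc_seen + 1. / acc_unseen).
def harmonic_mean(acc_seen, acc_unseen):
    if acc_seen == 0 or acc_unseen == 0:
        return 0.0
    return 2.0 * acc_seen * acc_unseen / (acc_seen + acc_unseen)

# Example: acc_seen=60.0 and acc_unseen=40.0 give H=48.0, so a checkpoint is
# saved only if 48.0 exceeds the current best_h.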
    def get_qualified_shops(self, threshold=0.07):
        qualified_shops = [shop_id for shop_id in range(1, 2001)
                           if self.get_best_loss(shop_id) <= threshold]
        return qualified_shops

    def get_unqualified_shops(self, threshold=0.07):
        unqualified_shops = [shop_id for shop_id in range(1, 2001)
                             if self.get_best_loss(shop_id) > threshold]
        return unqualified_shops

    def get_mean_loss(self, threshold=0.07):
        all_loss = [self.get_best_loss(shop_id) for shop_id in range(1, 2001)
                    if self.get_best_loss(shop_id) < threshold]
        return np.mean(all_loss)


if __name__ == '__main__':
    loader_data = DataLoader.load_data()
    shop_info = loader_data['shop_info']
    user_pay = loader_data['user_pay']
    user_view = loader_data['user_view']
    shop_ids = range(1, 2001)
    ordinary_dates = pd.date_range(start='2015-10-10', end='2016-10-31',
                                   freq='D').strftime('%Y-%m-%d')
    business_dates = pd.date_range(start='2015-10-10', end='2016-10-31',
                                   freq='B').strftime('%Y-%m-%d')
    result = {}
    for shop_id in shop_ids[:1]:
        shop_info_data = shop_info.iloc[shop_id - 1].to_dict()
        user_pay_data = user_pay[user_pay.shop_id == shop_id]
        user_pay_data = user_pay_data.sort_values(by='time_stamp')
        user_pay_data['date'] = user_pay_data['time_stamp'].apply(lambda x: x[:10])
        user_pay_info = UserPayInfoBase(shop_id, shop_info_data, ordinary_dates)
        user_pay_info.set_flow(user_pay_data, ordinary_dates)
        result[shop_id] = user_pay_info
LEARNING_RATE = 0.001
REGULARIZATION_RATE = 1
embedding_dim = 100
logs_path = "/app/tmp/logs/20/"
data_root = "/app/data/datasets/amazon-fine-food-reviews/"
train_filename = "train_Reviews"
test_filename = "test_Reviews"
valid_filename = "valid_Reviews"

print("Loading data...")
train_data = DataLoader(data_root, train_filename, NUM_EPOCHS, BATCH_SIZE,
                        "Text", "Score")
train_data.load_data()
valid_data = DataLoader(data_root, valid_filename, NUM_EPOCHS, BATCH_SIZE,
                        "Text", "Score")
valid_data.load_data()

model = CLSTMModel(num_classes=NUM_CLASSES,
                   embedding_dim=embedding_dim,
                   sequence_len=train_data.sequence_len)

sess = tf.Session()

# Placeholders for the input sequences, labels, true sequence lengths and the
# dropout keep probability; labels are also expanded into one-hot vectors.
x = tf.placeholder(tf.int32, [None, train_data.sequence_len], name="x")
y = tf.placeholder(tf.int32, [None, 1], name="y")
lengths = tf.placeholder(tf.int32, [None])
keep_prob = tf.placeholder(tf.float32)
oh_y = tf.squeeze(tf.one_hot(y, depth=NUM_CLASSES, name='oh_y'))