Example #1
def main():
    opt = parse_option()
    train_loader, n_data = set_loader(opt)
    model = set_model(opt, train_loader, n_data)
    optimizer, lr_scheduler = set_optimizer(opt, model, len(train_loader))

    writer = SummaryWriter(logdir=opt.tb_folder)

    for epoch in range(1, opt.epochs + 1):
        end = time.time()
        loss_byol = train(train_loader, model, optimizer, lr_scheduler, epoch,
                          opt)
        print('epoch {}, total time {:.2f}s'.format(epoch, time.time() - end))

        writer.add_scalar('train loss', loss_byol, epoch)
        writer.add_scalar('learning_rate', lr_scheduler.get_lr(), epoch)

        if epoch % opt.save_freq == 0:
            save_file = os.path.join(
                opt.save_folder, 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            save_model(model.model, optimizer, opt, epoch, save_file)

    # save the last model
    save_file = os.path.join(opt.save_folder, 'last.pth')
    save_model(model.model, optimizer, opt, opt.epochs, save_file)
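The save_model helper called above is not part of the listing; a minimal sketch consistent with the call save_model(model, optimizer, opt, epoch, save_file), assuming a plain torch.save checkpoint dictionary, could look like this:

import torch

def save_model(model, optimizer, opt, epoch, save_file):
    # hypothetical helper: bundle run options, weights, optimizer state and epoch into one checkpoint
    state = {
        'opt': opt,
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'epoch': epoch,
    }
    torch.save(state, save_file)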
Example #2
def get_we_matrix_wi_encode_docs_w_fasttext(ftext_model_path,
                                            docs_text_main_folder,
                                            encoded_out_folder_docs):
    f = load_model(ftext_model_path)
    text_by_name = {}
    print('reading files in folder')
    for filename in tqdm(os.listdir(docs_text_main_folder)):
        fp = os.path.join(docs_text_main_folder, filename)
        if os.path.isfile(fp):
            text_by_name[filename.split(r'.')[0]] = ' '.join(
                open(fp, 'r').readlines())
    stoplist = load_indri_stopwords()

    print('encoding collection')
    encoded_docs_by_name = {}
    wi = {}
    we_matrix = []
    for dn, dt in tqdm(text_by_name.items()):
        tok_doc = util.tokenize(dt, stemming=False, stoplist=stoplist)
        encoded_doc = []
        for tok in tok_doc:
            if tok not in wi.keys():
                wv = f.get_word_vector(tok)
                wi[tok] = len(wi)
                we_matrix.append(wv)
            encoded_doc.append(wi[tok])
        util.save_model(encoded_doc, os.path.join(encoded_out_folder_docs, dn))
        encoded_docs_by_name[dn] = encoded_doc
    return encoded_docs_by_name, wi, we_matrix
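This and several of the retrieval examples below persist plain Python objects (encoded documents, word indices, folds) with util.save_model and read them back with util.load_model. The util module is not shown, so the pickle-based sketch below is only an assumption that matches this usage:

import pickle

def save_model(obj, path):
    # serialize an arbitrary Python object (dict, list, numpy array) to disk
    with open(path, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)

def load_model(path):
    # counterpart used by the examples to restore a previously saved object
    with open(path, 'rb') as f:
        return pickle.load(f)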
Example #3
def run_model(which_task, model_params, train_iter, valid_iter, test_iter, save=False, model_file=''):
    '''
    Trains single model w/ given parameters and evaluates on test

    Takes:
    - string denoting which field is label ("response" or "product")
    - dict of model parameters
    - train iterable of batches
    - validation iterable of batches
    - test iterable of batches
    - boolean indicating whether to save the model state dict
    - filename for model state dict (in models subdirectory)
    '''

    best_model, train_time = optimize_params(model_params, train_iter, valid_iter)
    
    # compute loss on test set
    test_loss = best_model.evaluate(test_iter, BATCH_SIZE)
    print("Loss of best model on testing set:", test_loss)

    # save state
    if save:
        optimized_dict = best_model.state_dict()
    
        try:
            util.save_model(optimized_dict, model_file)
        except Exception as e:
            print(e)
            return best_model, train_time, test_loss

    return best_model, train_time, test_loss
Example #4
def postprocess(dataset, model, noise_type, noise_ratio, folders,
                y_test_noisy):
    log_dir = folders['logdir']
    loss, acc = model.evaluate(dataset.x_test, dataset.y_test, verbose=0)
    print('loss:', loss, '- acc:', acc)

    # calculate similarity of the given confusion matrix and the output confusion matrix
    pred = model.predict(dataset.x_test)
    pred_int = np.argmax(pred, axis=1)
    sim = 1 - distance.cosine(pred_int, y_test_noisy)
    print('Similarity is', sim)
    # plot confusion matrix
    plot_cm(model,
            dataset.x_test,
            dataset.y_test_int(),
            dataset.class_names,
            log_dir + '/cm.png',
            title='acc({}), similarity({})'.format(round(acc, 3),
                                                   round(sim, 2)))
    # plot accuracies and losses for all models
    base_folder = folders['logbase_nr']
    plot_overall(base_folder)
    # save variables
    np.save(log_dir + 'preds.npy', pred_int)
    save_model(model, log_dir + 'model/model')
Example #5
def save_model_outputs(model, _dataset, model_path):
    npy_path = model_path + 'npy/'
    create_folders(npy_path, model_path + 'model/')
    model_soft = Model(model.input, model.get_layer('features').output)
    # save softmax predictions
    pred = model.predict(_dataset.x_train)[:, :_dataset.num_classes]
    pred_int = np.argmax(pred, axis=1)
    np.save(npy_path + 'train_preds.npy', pred)
    np.save(npy_path + 'train_preds_int.npy', pred_int)
    pred = model.predict(_dataset.x_test)[:, :_dataset.num_classes]
    pred_int = np.argmax(pred, axis=1)
    np.save(npy_path + 'test_preds.npy', pred)
    np.save(npy_path + 'test_preds_int.npy', pred_int)
    # save logits
    logits_train = model_soft.predict(
        _dataset.x_train)[:, :_dataset.num_classes]
    logits_test = model_soft.predict(_dataset.x_test)[:, :_dataset.num_classes]
    np.save(npy_path + 'train_logits.npy', logits_train)
    np.save(npy_path + 'test_logits.npy', logits_test)
    # save confusion matrices
    cm_train = plot_cm(model, _dataset.x_train, _dataset.y_train_int(),
                       _dataset.class_names, model_path + 'train_cm.png')
    cm_test = plot_cm(model, _dataset.x_test, _dataset.y_test_int(),
                      _dataset.class_names, model_path + 'test_cm.png')
    np.save(npy_path + 'train_cm.npy', cm_train)
    np.save(npy_path + 'test_cm.npy', cm_test)
    # save distance matrices
    plot_dm(model_soft, _dataset.x_train, _dataset.y_train_int(),
            _dataset.class_names, model_path + 'train_dm.png')
    plot_dm(model_soft, _dataset.x_test, _dataset.y_test_int(),
            _dataset.class_names, model_path + 'test_dm.png')
    # save model
    plot_model(model, model_path + 'model/model.png')
    save_model(model, model_path + 'model/model')
    K.clear_session()
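Here save_model(model, path) receives a Keras model and a path without an extension ('model/model'). The helper itself is not shown; one plausible sketch, assuming it stores the architecture and the weights separately, is:

def save_model(model, path):
    # hypothetical helper: write the architecture as JSON and the weights as HDF5
    with open(path + '.json', 'w') as f:
        f.write(model.to_json())
    model.save_weights(path + '.h5')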
Example #6
File: main.py  Project: VamosC/MNIST
def train():
    images, labels = process_data('./data/train-images-idx3-ubyte',
                                  './data/train-labels-idx1-ubyte')
    train_set = Mnist(images, labels)
    # train_loader = DataLoader(train_set, batch_size=64,
    #                           shuffle=True, num_workers=8, pin_memory=True)
    train_loader = DataLoader(train_set, batch_size=64, shuffle=True)
    model = Convnet()

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=20,
                                                   gamma=0.5)
    aver = Averager()
    for epoch in range(1, 11):
        lr_scheduler.step()
        model.train()
        for i, batch in enumerate(train_loader, 1):
            # image, label = [_.cuda() for _ in batch]
            image, label = batch
            score = model(image)
            loss = F.cross_entropy(score, label.long())
            acc = count_acc(score, label, aver)
            print('epoch %d batch %d acc: %f' % (epoch, i, acc))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print('epoch %d acc: %f' % (epoch, aver.item()))
    save_model(model, 'model-1')
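save_model(model, 'model-1') is called with only a model and a name; assuming the helper simply stores the learned parameters, a sketch could be:

import torch

def save_model(model, name):
    # hypothetical helper: persist only the state dict under <name>.pth
    torch.save(model.state_dict(), name + '.pth')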
Example #7
def compute_inverted_index(coll_folder, stemming, output_file_path_ii):
    if not os.path.isfile(output_file_path_ii):
        print('computing inverted index')
        inverted_idx = {}
        sw = util.load_indri_stopwords()
        doc_n = 0
        for filename in tqdm(os.listdir(coll_folder)):
            fp = os.path.join(coll_folder, filename)
            doc_id = filename.split(r'.')[0]
            if os.path.isfile(fp):
                doc_n += 1
                d = util.tokenize(' '.join(open(fp, 'r').readlines()),
                                  stemming,
                                  stoplist=sw)
                set_w_in_doc = set(d)
                for w in set_w_in_doc:
                    if w in inverted_idx.keys():
                        inverted_idx[w].append((doc_id, d.count(w)))
                    else:
                        inverted_idx[w] = [(doc_id, d.count(w))]

        util.save_model(inverted_idx, output_file_path_ii)
    else:
        inverted_idx = util.load_model(output_file_path_ii)
    return inverted_idx
Example #8
def main():
    model = None
    if config.model == 'Baseline':
        model = models.Baseline()
    if config.train:
        # Can train from existing weights (when config.restart is False, the most recent weights are used by default)
        weights_path = None
        if not config.restart:
            weights_path = os.path.join(
                config.model_save_dir,
                get_recent_weights_path(config.model_save_dir))
        model.build(weights_path)
        # train(model)
        simple_train(model)
        if config.save_model:
            save_model(model,
                       config.model_save_path,
                       config.model_weights_save_path,
                       ext=".h5")
    else:
        # If we only care about predicting!
        # Make sure there are trained weights (most recent will be used by default)
        weights_path = os.path.join(
            config.model_save_dir,
            get_recent_weights_path(config.model_save_dir))
        model.build(weights_path)
        # pred(model)
        simple_pred(model)
Example #9
def train(starting_epoch, model, optimizer, scheduler, criterion, trainer, evaluator, ENV):
    for epoch in range(starting_epoch, config.epochs):
        logger.info("="*20 + "Training" + "="*20)

        # Train
        ENV['global_step'] = trainer.train(epoch, ENV['global_step'], model, optimizer, criterion)
        scheduler.step()

        # Eval
        logger.info("="*20 + "Eval" + "="*20)
        evaluator.eval(epoch, ENV['global_step'], model, torch.nn.CrossEntropyLoss())
        payload = ('Eval Loss:%.4f\tEval acc: %.2f' % (evaluator.loss_meters.avg, evaluator.acc_meters.avg*100))
        logger.info(payload)
        ENV['train_history'].append(trainer.acc_meters.avg*100)
        ENV['eval_history'].append(evaluator.acc_meters.avg*100)
        ENV['curren_acc'] = evaluator.acc_meters.avg*100
        ENV['best_acc'] = max(ENV['curren_acc'], ENV['best_acc'])

        # Reset Stats
        trainer._reset_stats()
        evaluator._reset_stats()

        # Save Model
        target_model = model.module if args.data_parallel else model
        util.save_model(ENV=ENV,
                        epoch=epoch,
                        model=target_model,
                        optimizer=optimizer,
                        scheduler=scheduler,
                        filename=checkpoint_path_file)
        logger.info('Model Saved at %s', checkpoint_path_file)
    return
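util.save_model is invoked here with keyword arguments; the helper is not part of the listing, so the checkpointing sketch below is an assumption shaped only by the call site:

import torch

def save_model(ENV, epoch, model, optimizer, scheduler, filename):
    # bundle training history plus model/optimizer/scheduler state into one checkpoint file
    torch.save({
        'ENV': ENV,
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
    }, filename)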
Example #10
def pre_compute_test_fd_pwe(qbn, dbn, w, run_to_rerank, max_q_len, max_d_len,
                            fold, output_folder):
    # test_fd = []
    test_fd_fp = os.path.join(output_folder, 'test_fd_' + str(fold))
    if not os.path.isfile(test_fd_fp):
        d_t_rerank_by_query = compute_docs_to_rerank_by_query(
            run_to_rerank, qbn.keys())
        for q_name in tqdm(qbn.keys()):
            if os.path.isfile(test_fd_fp + '_' + q_name):
                continue
            if q_name not in d_t_rerank_by_query.keys():
                continue
            d_names = d_t_rerank_by_query[q_name]
            docs = [dbn[dn] for dn in d_names if dn in dbn.keys()]
            d_lengths = [len(d) for d in docs]
            # padded_d = [pad(d, padding_value, max_d_len) for d in docs]
            q = qbn[q_name]
            all_sim_m = []
            d_batch = []
            for d in docs:
                d_batch.append(d)
                if len(d_batch) == 16:
                    all_sim_m.extend(
                        parallel_compute_sim_matrices([q] * len(d_batch),
                                                      d_batch, max_q_len,
                                                      max_d_len, w))
                    d_batch = []
            if d_batch:
                # flush the last, partially filled batch so every document gets a similarity matrix
                all_sim_m.extend(
                    parallel_compute_sim_matrices([q] * len(d_batch),
                                                  d_batch, max_q_len,
                                                  max_d_len, w))
            # test_fd.append(([len(qbn[q_name])] * len(docs), d_lengths, d_names, q_name, all_sim_m))
            util.save_model(([len(qbn[q_name])] * len(docs), d_lengths,
                             d_names, q_name, all_sim_m),
                            test_fd_fp + '_' + q_name)
Example #11
def read_collection(coll_main_folder,
                    output_model_path,
                    stemming,
                    stoplist=None):
    if not os.path.isfile(output_model_path):
        if stoplist is None:
            stoplist = util.load_indri_stopwords()
        text_by_name = {}
        print('reading files in folder')
        pool = multiprocessing.Pool(8)
        fnames_list = os.listdir(coll_main_folder)
        doc_paths_list = [
            os.path.join(coll_main_folder, filename)
            for filename in fnames_list
        ]
        print('processing collection')
        tokenized_docs = pool.starmap(
            util.tokenize,
            [(' '.join(open(fp, 'r').readlines()), stemming, stoplist)
             for fp in doc_paths_list])

        for i in range(len(fnames_list)):
            text_by_name[fnames_list[i].split(r'.')[0]] = tokenized_docs[i]

        print('saving model')
        util.save_model(text_by_name, output_model_path)
    else:
        print('loading model: %s' % output_model_path)
        text_by_name = util.load_model(output_model_path)
    return text_by_name
Example #12
def encode_collection(text_by_name_p, word_dict_path, encoded_out_folder):
    # word_dict_path = '/media/alberto/DATA/BaiduNetdiskDownload/data/word_dict.txt'
    text_by_name = {}
    print('reading files in folder')
    for filename in tqdm(os.listdir(text_by_name_p)):
        fp = os.path.join(text_by_name_p, filename)
        if os.path.isfile(fp):
            text_by_name[filename.split(r'.')[0]] = ' '.join(
                open(fp, 'r').readlines())
    print('reading word2vec model')
    encoded_docs_by_name = {}
    wi = {}
    for line in tqdm(open(word_dict_path)):
        data = line.split()
        word = data[0].strip()
        wid = int(data[1].strip())
        if word not in wi.keys():
            wi[word] = wid
    sw = load_indri_stopwords()
    print('encoding data')
    for dn, dc in tqdm(text_by_name.items()):
        td = util.tokenize(dc, stemming=False, stoplist=sw)
        encoded_doc = [wi[w] for w in td if w in wi.keys()]
        util.save_model(encoded_doc, os.path.join(encoded_out_folder, dn))
        encoded_docs_by_name[dn] = encoded_doc
    return encoded_docs_by_name
Example #13
def compute_docs_to_rerank(dbn, qbn, we, gt_path):
    print('computing relevant docs by query')
    rel_docs_by_query = du.get_rel_docs_by_qry(gt_path)
    print('computing document representations')
    dbn_means = {}
    for k, v in tqdm(dbn.items()):
        if len(v) == 0:
            mean = np.zeros(50)
            dbn_means[k] = mean
        else:
            mean = np.mean([we[w] for w in v], axis=0)
            dbn_means[k] = mean / np.linalg.norm(mean)
    print('computing queries representations')
    qbn_means = {}
    for k, v in tqdm(qbn.items()):
        if len(v) == 0:
            mean = np.zeros(50)
            qbn_means[k] = mean
        else:
            mean = np.mean([we[w] for w in v], axis=0)
            qbn_means[k] = mean / np.linalg.norm(mean)

    doc_names = list(dbn_means.keys())
    doc_names = np.array(doc_names)
    print('computing rankings')
    sorted_d_names_by_query = {}
    incremental_n_rel_docs_by_query = {}
    for qn, q in tqdm(qbn_means.items()):
        if qn not in rel_docs_by_query.keys():
            continue
        dists = [-1] * len(doc_names)
        for i in range(len(doc_names)):
            dn = doc_names[i]
            bonus = np.sum([10 for w in qbn[qn] if w in dbn[dn]])
            dists[i] = np.dot(dbn_means[dn], q) + bonus
        sorted_indices = np.argsort(-np.array(dists))
        sorted_dnames = doc_names[sorted_indices]
        sorted_d_names_by_query[qn] = sorted_dnames[0:8000]
        incremental_n_rel_docs_by_query[qn] = []
        rel_cnt = 0
        for i in range(len(sorted_dnames)):
            dn = sorted_dnames[i]
            if dn in rel_docs_by_query[qn]:
                rel_cnt += 1
            incremental_n_rel_docs_by_query[qn].append(rel_cnt)

    util.save_model(sorted_d_names_by_query, 'sorted_d_names_by_query.model')

    merged_incremental_rel_cnt = np.zeros(len(doc_names))
    # util.save_model(sorted_d_names_by_query, 'sorted_d_names_by_query_w_bonus.model')
    # util.save_model(merged_incremental_rel_cnt, 'merged_incremental_rel_cnt_w_bonus.model')
    print('preparing plot data')
    for q, cnts in tqdm(incremental_n_rel_docs_by_query.items()):
        for i in range(len(cnts)):
            merged_incremental_rel_cnt[i] += cnts[i]

    out = open('log.txt', 'w')
    for i in merged_incremental_rel_cnt:
        out.write(str(i) + '\n')
    out.close()
Example #14
def main():
    for epoch in range(args.n_epoch):
        print('\n\n-------------------------------------------')
        print('Epoch-{}'.format(epoch))
        print('-------------------------------------------')

        model.train()

        train_iter = tqdm(enumerate(dataset.train_iter()))
        train_iter.set_description_str('Training')

        for it, mb in train_iter:
            output = model(mb.context, mb.response)
            loss = F.binary_cross_entropy_with_logits(output, mb.label)

            loss.backward()
            # clip_gradient_threshold(model, -10, 10)
            solver.step()
            solver.zero_grad()

            if it > 0 and it % 1000 == 0:
                # Validation
                recall_at_ks = eval_model(model, dataset.valid_iter(),
                                          max_seq_len, max_seq_len, args.gpu)

                print(
                    'Loss: {:.3f}; recall@1: {:.3f}; recall@2: {:.3f}; recall@5: {:.3f}'
                    .format(loss.data[0], recall_at_ks[0], recall_at_ks[1],
                            recall_at_ks[4]))

        save_model(model, 'ccn_lstm')
Example #15
File: solver.py  Project: Fhrozen/nyural
 def solve(self):
     cnf = self.cnf
     print(cnf.initlambda)
     self.ff.run_batches(self.inputs, self.targets, 
                         optimizer= HessianFree(CG_iter=250,init_damping=cnf.initlambda), 
                         batch_size=7500, test=self.test, max_epochs=cnf.max_epoch, 
                         plotting=True)
     self.logger.info("Optimization Done")
     util.save_model(self.cnf, self.ff)
Example #16
def main():
    best_acc = 0
    opt = parse_option()

    # build data loader
    train_loader = set_loader(opt)
    _, val_loader = set_val_loader(opt)

    # build model and criterion
    model, classifier, criterions = set_model(opt)

    # build optimizer
    parameters = list(model.parameters()) + list(classifier.parameters())
    optimizer = set_optimizer(opt, parameters)

    # tensorboard
    writer = SummaryWriter(log_dir=opt.tb_folder, flush_secs=2)

    # build memory banks
    memory_banks = {
        'labeled': ReserviorMemory(opt.labeled_memory_capacity),
        'unlabeled': ReserviorMemory(opt.unlabeled_memory_capacity)
    }

    # training routine
    for epoch in range(1, opt.epochs + 1):

        # train for one epoch
        time1 = time.time()
        loss = train(train_loader, memory_banks, model, classifier, criterions,
                     optimizer, epoch, opt)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        # eval for one epoch
        if epoch % opt.val_freq == 0 or epoch == opt.epochs:
            loss, val_acc = validate(val_loader, model, classifier,
                                     criterions['CrossEntropyLoss'], opt)
            if val_acc > best_acc:
                best_acc = val_acc

        # tensorboard logger
        writer.add_scalar('loss', loss, global_step=epoch)
        writer.add_scalar('learning_rate',
                          optimizer.param_groups[0]['lr'],
                          global_step=epoch)

        if epoch % opt.save_freq == 0:
            save_file = os.path.join(
                opt.save_folder, 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            save_model(model, optimizer, opt, epoch, save_file)

    print('best accuracy: {:.2f}'.format(best_acc))

    # save the last model
    save_file = os.path.join(opt.save_folder, 'last.pth')
    save_model(model, optimizer, opt, opt.epochs, save_file)
Example #17
def compute_training_pairs_w_queries_variations(fold_idx, coll, n_iter_per_query, gt_file, dbn, qbn, ftt_model, iwi,
                                                wi):
    model_name = 'qn_rd_nrd_pairs_w2v_gk_' + str(fold_idx) + '_' + str(coll) + '_' + str(n_iter_per_query)
    if not os.path.isfile(model_name):
        rd_b_qry = {}
        nrd_by_qry = {}
        for line in open(gt_file):
            data = line.split()
            qname = data[0].strip()
            dname = data[2].strip()
            if dname not in dbn.keys():
                continue
            rj = int(data[3].strip())
            if qname not in rd_b_qry.keys():
                rd_b_qry[qname] = []
                nrd_by_qry[qname] = []

            if rj > 0:
                rd_b_qry[qname].append(dname)
            else:
                nrd_by_qry[qname].append(dname)
        test_q_names = list(qbn.keys())
        np.random.shuffle(test_q_names)

        qn_rd_nrd_pairs = []
        for qn in test_q_names:
            if qn not in rd_b_qry.keys():
                continue

            # add training examples with original query:
            encoded_q = qbn[qn]
            tmp_rdocs = np.random.choice(rd_b_qry[qn], n_iter_per_query, replace=True)
            tmp_nrdocs = np.random.choice(nrd_by_qry[qn], n_iter_per_query, replace=True)
            for i in range(n_iter_per_query):
                qn_rd_nrd_pairs.append((encoded_q, dbn[tmp_rdocs[i]], dbn[tmp_nrdocs[i]]))
            print('original query: ' + ' '.join([iwi[w] for w in encoded_q]))
            # add extra training examples
            for i in range(len(encoded_q)):
                encoded_q_variation = list(encoded_q)  # copy so substitutions do not mutate the original query
                curr_q_word = iwi[encoded_q[i]]
                similar_words = get_synonyms(curr_q_word, ftt_model)
                for sw in similar_words:
                    sw = util.stem(sw)
                    if sw in wi.keys() and curr_q_word != sw:
                        print('word = ' + curr_q_word + ', substitute = ' + sw)
                        encoded_q_variation[i] = wi[sw]
                        print('alternative query: ' + ' '.join([iwi[w] for w in encoded_q_variation]))
                        tmp_rdocs = np.random.choice(rd_b_qry[qn], n_iter_per_query, replace=True)
                        tmp_nrdocs = np.random.choice(nrd_by_qry[qn], n_iter_per_query, replace=True)
                        for j in range(n_iter_per_query):
                            qn_rd_nrd_pairs.append((encoded_q_variation, dbn[tmp_rdocs[j]], dbn[tmp_nrdocs[j]]))

        np.random.shuffle(qn_rd_nrd_pairs)
        util.save_model(qn_rd_nrd_pairs, model_name)
    else:
        qn_rd_nrd_pairs = util.load_model(model_name)
    return qn_rd_nrd_pairs
Example #18
def main():
    best_val = 0.0
    for epoch in range(args.n_epoch):
        print('\n\n-------------------------------------------')
        print('Epoch-{}'.format(epoch))
        print('-------------------------------------------')

        model.train()

        train_iter = enumerate(udc.get_iter('train'))

        if not args.no_tqdm:
            train_iter = tqdm(train_iter)
            train_iter.set_description_str('Training')
            train_iter.total = udc.n_train // udc.batch_size

        for it, mb in train_iter:
            #context, response, y, cm, rm, ql = mb
            context, response, y, cm, rm, ql, key_r, key_mask_r = mb
            output = model(context, response, cm, rm, key_r, key_mask_r)
            #output = model(context, response, cm, rm)
            #output = model(context, response)
            loss = F.binary_cross_entropy_with_logits(output, y)
            # loss = F.mse_loss(F.sigmoid(output), y)

            loss.backward()
            #print (model.conv3.grad)
            #clip_gradient_threshold(model, -10, 10)
            solver.step()
            solver.zero_grad()

        del (context, response, y, output)
        # Validation
        recall_at_ks = eval_model_v2(model,
                                     udc,
                                     'valid',
                                     gpu=args.gpu,
                                     no_tqdm=args.no_tqdm)

        print(
            'Loss: {:.3f}; recall@1: {:.3f}; recall@2: {:.3f}; recall@5: {:.3f}'
            .format(loss.data[0], recall_at_ks[0], recall_at_ks[1],
                    recall_at_ks[4]))
        recall_1 = recall_at_ks[0]
        # if epoch > 10:
        #     eval_test()

        if best_val == 0.0:
            save_model(model, model_name)
            best_val = recall_1
        else:
            if recall_1 > best_val:
                best_val = recall_1
                print("Saving model for recall@1:" + str(recall_1))
                save_model(model, model_name)
            else:
                print("Not saving, best accuracy so far:" + str(best_val))
Example #19
def encode_collection_with_stemming(text_by_name_p,
                                    word_dict_path,
                                    w2v_model_path,
                                    encoded_out_folder,
                                    wi=None,
                                    word_embeddings_matrix=None):
    text_by_name = {}
    print('reading files in folder')
    for filename in tqdm(os.listdir(text_by_name_p)):
        fp = os.path.join(text_by_name_p, filename)
        if os.path.isfile(fp):
            text_by_name[filename.split(r'.')[0]] = ' '.join(
                open(fp, 'r').readlines())

    # initialize embeddings matrix
    if word_embeddings_matrix is None:
        # read and adapt word index
        if wi is None:
            wi = {}
            wids_to_merge = {}
            for line in tqdm(open(word_dict_path)):
                data = line.split()
                word_stemmed = util.stem(data[0].strip())
                wid = int(data[1].strip())
                if word_stemmed not in wi.keys():
                    wi[word_stemmed] = len(wi)
                    wids_to_merge[word_stemmed] = [wid]
                else:
                    wids_to_merge[word_stemmed].append(wid)
        we_size = 50
        word_embeddings_matrix = np.float32(
            np.random.uniform(-0.02, 0.02, [len(wi) + 1, we_size]))
        padding_value = np.zeros(we_size)
        word_embeddings_matrix[word_embeddings_matrix.shape[0] -
                               1] = padding_value
        w2v_model = load_w2v_we(w2v_model_path)
        for k, v in wi.items():
            we = np.zeros(we_size)
            summed_something = False
            for wid in wids_to_merge[k]:
                if wid in w2v_model.keys():
                    we = np.sum((we, w2v_model[wid]), axis=0)
                    summed_something = True
            if summed_something:
                we = we / np.linalg.norm(we)  # normalize new word embedding
                word_embeddings_matrix[v] = we

    encoded_docs_by_name = {}
    sw = load_indri_stopwords()
    print('encoding data')
    for dn, dc in tqdm(text_by_name.items()):
        td = util.tokenize(dc, stemming=True, stoplist=sw)
        encoded_doc = [wi[w] for w in td if w in wi.keys()]
        util.save_model(encoded_doc, os.path.join(encoded_out_folder, dn))
        encoded_docs_by_name[dn] = encoded_doc
    return encoded_docs_by_name, wi, word_embeddings_matrix
Example #20
def compute_train_test_q_names(q_names):
    np.random.seed(0)
    if not os.path.isfile('test_q_names'):
        training_q_names = np.random.choice(q_names, 200, replace=False)
        test_q_names = [qn for qn in q_names if qn not in training_q_names]
        util.save_model(test_q_names, 'test_q_names')
        util.save_model(training_q_names, 'train_q_names')
    else:
        training_q_names = util.load_model('train_q_names')
        test_q_names = util.load_model('test_q_names')
    return training_q_names, test_q_names
Example #21
def main():
    #filter_seasons = set([10])
    filter_seasons = None
    min_size = 56
    max_size = 64
    training, testing, charset = southpark.load_generative_data(
        min_size=min_size,
        max_size=max_size,
        filter_seasons=filter_seasons,
        dataset_size=500000)

    print("Dataset")
    print("  Training Size : {}".format(len(training[1])))
    print("  Testing Size  : {}".format(len(testing[1])))
    print("  Charset Size  : {}".format(len(charset)))
    print("  Charset       : {}".format(charset))
    print()

    print("Creating Model...")
    model = create_model(charset, max_size)

    batch_size = 128
    use_gpu_multi_batching = False

    if use_gpu_multi_batching:
        model = gpu_multi_batch(model,
                                training,
                                testing,
                                charset,
                                batch_size,
                                epochs=5,
                                num_gpu_batches=1000)
    else:
        #batch_gen = batch_sample_generator(dataset, charset, batch_size)
        batch_gen = batch_generator(training, charset, batch_size)

        print("Fitting Model...")

        filepath = "script_gen_best.hdf5"
        checkpoint = ModelCheckpoint(filepath,
                                     monitor='acc',
                                     verbose=1,
                                     save_best_only=True,
                                     mode='max')
        callbacks_list = [checkpoint]

        model.fit_generator(batch_gen,
                            len(training[1]) // batch_size,
                            epochs=20,
                            use_multiprocessing=False,
                            callbacks=callbacks_list,
                            verbose=1)

    save_model("script_gen_20_epoch", model, charset, overwrite=True)
Example #22
def compute_kfolds_train_test(n_folds, q_names, coll):
    if not os.path.isfile('folds_' + coll):
        folds = []
        q_names = np.array(q_names)
        kf = KFold(n_splits=n_folds, random_state=0, shuffle=True)
        for train_index, test_index in kf.split(q_names):
            q_train, q_test = q_names[train_index], q_names[test_index]
            folds.append((q_train, q_test))
        util.save_model(folds, 'folds_' + coll)
    else:
        folds = util.load_model('folds_' + coll)
    return folds
Example #23
def main():
    best_acc = 0
    opt = parse_option()

    # build data loader
    train_loader, val_loader = set_loader(opt)

    # build model and criterion
    model, criterion = set_model(opt)

    # build optimizer
    optimizer = set_optimizer(opt, model)

    # tensorboard
    writer = SummaryWriter(log_dir=opt.tb_folder, flush_secs=2)

    # training routine
    for epoch in range(1, opt.epochs + 1):
        adjust_learning_rate(opt, optimizer, epoch)

        # train for one epoch
        time1 = time.time()
        loss, train_acc = train(train_loader, model, criterion, optimizer,
                                epoch, opt)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        # tensorboard logger
        writer.add_scalar('train_loss', loss, global_step=epoch)
        writer.add_scalar('train_acc', train_acc, global_step=epoch)
        writer.add_scalar('learning_rate',
                          optimizer.param_groups[0]['lr'],
                          global_step=epoch)

        # evaluation
        loss, val_acc = validate(val_loader, model, criterion, opt)
        writer.add_scalar('val_loss', loss, global_step=epoch)
        writer.add_scalar('val_acc', val_acc, global_step=epoch)

        if val_acc > best_acc:
            best_acc = val_acc

        if epoch % opt.save_freq == 0:
            save_file = os.path.join(
                opt.save_folder, 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            save_model(model, optimizer, opt, epoch, save_file)

    # save the last model
    save_file = os.path.join(opt.save_folder, 'last.pth')
    save_model(model, optimizer, opt, opt.epochs, save_file)

    print('best accuracy: {:.2f}'.format(best_acc))
Example #24
def compute_data():
    ftext_model_path = '../data/fasttext_models/wiki.en.bin'
    output_path_wi_model = '../data/fasttext_models/wi_robust'
    output_path_ii_model = '../data/fasttext_models/ii_robust'
    output_path_idf_model = '../data/fasttext_models/idf_robust'
    output_path_encoded_d_model = '../data/fasttext_models/encoded_dbn'
    output_path_encoded_q_model = '../data/fasttext_models/encoded_qbn'
    output_path_we_matrix_model = '../data/fasttext_models/word_embeddings_matrix_robust'
    coll_path = '/Users/albertopurpura/ExperimentalCollections/Robust04/processed/corpus'
    queries_main_folder = '/Users/albertopurpura/ExperimentalCollections/Robust04/processed/topics'
    output_model_path = 'data/robust/stemmed_coll_model'
    encoded_out_folder_docs = 'data/robust/stemmed_encoded_docs_ft'

    stemming = True

    if not os.path.isfile(output_path_ii_model):
        print('computing inverted index')
        ii = compute_inverted_index(coll_path, stemming, output_path_ii_model)
        util.save_model(ii, output_path_ii_model)
    else:
        print('loading inverted index')
        ii = util.load_model(output_path_ii_model)

    if not os.path.isfile(output_path_encoded_d_model):
        text_dbn = read_collection(coll_path,
                                   output_model_path,
                                   stemming=stemming,
                                   stoplist=util.load_indri_stopwords())

        encoded_dbn, wi, we_matrix = compute_input_data(
            text_dbn, ftext_model_path, encoded_out_folder_docs)

        util.save_model(encoded_dbn, output_path_encoded_d_model)
        util.save_model(wi, output_path_wi_model)
        util.save_model(we_matrix, output_path_we_matrix_model)
    else:
        encoded_dbn = util.load_model(output_path_encoded_d_model)
        wi = util.load_model(output_path_wi_model)
        we_matrix = util.load_model(output_path_we_matrix_model)

    if not os.path.isfile(output_path_encoded_q_model):
        encoded_qbn = encode_queries(queries_main_folder, wi, stemming)
        util.save_model(encoded_qbn, output_path_encoded_q_model)
    else:
        encoded_qbn = util.load_model(output_path_encoded_q_model)

    idf_scores = du.compute_idf(coll_path, stemming, output_path_ii_model,
                                output_path_idf_model)

    return encoded_dbn, encoded_qbn, we_matrix, wi, ii, idf_scores
Example #25
def precompute_data(docs_proc_folder, queries_proc_folder, word_dict_path,
                    w2v_model_path, encoded_out_folder_docs,
                    encoded_out_folder_queries, output_path_wi_model,
                    output_path_we_matrix_model, output_path_encoded_q,
                    output_path_encoded_d, run_to_rerank, gt_file):
    # docs_proc_folder = '/media/alberto/DATA/ExperimentalCollections/ny/ny/nyt_proc_albe_2'
    # queries_proc_folder = '/media/alberto/DATA/ExperimentalCollections/ny/queries/queries_proc'
    # word_dict_path = 'data/word_dict.txt'
    # w2v_model_path = 'data/embed_wiki-pdc_d50_norm'
    # encoded_out_folder_docs = '/media/alberto/DATA/ExperimentalCollections/ny/encoded_corpus/encoded_docs'
    # encoded_out_folder_queries = '/media/alberto/DATA/ExperimentalCollections/ny/encoded_corpus/encoded_queries'
    # output_path_wi_model = '/media/alberto/DATA/ExperimentalCollections/ny/encoded_corpus/word_index_stemmed'
    # output_path_we_matrix_model = '/media/alberto/DATA/ExperimentalCollections/ny/encoded_corpus/word_embeddings_matrix'
    # output_path_encoded_q = '/media/alberto/DATA/ExperimentalCollections/ny/encoded_corpus/q_by_name'
    # output_path_encoded_d = '/media/alberto/DATA/ExperimentalCollections/ny/encoded_corpus/d_by_name_filtered'

    dbn, wi, word_embeddings_matrix = encode_collection_with_stemming(
        docs_proc_folder, word_dict_path, w2v_model_path,
        encoded_out_folder_docs)
    util.save_model(wi, output_path_wi_model)
    util.save_model(word_embeddings_matrix, output_path_we_matrix_model)

    qbn, wi, word_embeddings_matrix = encode_collection_with_stemming(
        queries_proc_folder, word_dict_path, w2v_model_path,
        encoded_out_folder_queries, wi, word_embeddings_matrix)

    dbn_filtered = keep_only_used_docs(gt_file, run_to_rerank,
                                       encoded_out_folder_docs)
    util.save_model(qbn, output_path_encoded_q)
    util.save_model(dbn_filtered, output_path_encoded_d)
Example #26
def main():
    args = get_arguments()
    SEED = args.seed
    torch.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(SEED)

    util.mkdir('runs')

    test_acc_file = 'runs/test' + str(args.cosine) + str(args.bna) + str(
        args.bnd) + '.txt'
    train_acc_file = 'runs/train' + str(args.cosine) + str(args.bna) + str(
        args.bnd) + '.txt'
    open(test_acc_file, 'w')
    open(train_acc_file, 'w')

    print('Building model, loading data...\n')
    if args.cuda:
        torch.cuda.manual_seed(SEED)

    model, optimizer, training_generator, test_generator = initialize(args)

    best_pred_loss = 1000.0
    print('\nCheckpoint folder:', args.save, '\n\nCosine:', args.cosine,
          '\t\tBna:', args.bna, '\t\tBnd:', args.bnd, '\t\tContrastive:',
          args.cont, '\n\nStart training...\n')

    for epoch in range(1, args.nEpochs + 1):
        train_metrics = train(args, model, training_generator, optimizer,
                              epoch)
        test_metrics, confusion_matrix, ucsd_correct_total, sars_correct_total, ucsd_test_total, sars_test_total \
            = validation(args, model, test_generator, epoch, mode='test')

        best_pred_loss = util.save_model(model, optimizer, args, test_metrics,
                                         epoch, best_pred_loss,
                                         confusion_matrix)

        print('COVID-CT Accuracy: {0:.2f}%\tSARS-Cov-2 Accuracy: {1:.2f}%\n'.
              format(100. * ucsd_correct_total / ucsd_test_total,
                     100. * sars_correct_total / sars_test_total))

        with open(test_acc_file, 'a+') as f:
            f.write(
                str(test_metrics.data['correct'] /
                    test_metrics.data['total']) + ' ' +
                str(optimizer.param_groups[0]['lr']) + ' ' +
                str(test_metrics.data['loss'] /
                    (test_metrics.data['total'] // args.batch_size + 1)) +
                '\n')
        with open(train_acc_file, 'a+') as f:
            f.write(
                str(train_metrics.data['correct'] /
                    train_metrics.data['total']) + ' ' +
                str(optimizer.param_groups[0]['lr']) + ' ' +
                str(train_metrics.data['loss'] /
                    (train_metrics.data['total'] // args.batch_size + 1)) +
                '\n')

        adjust_learning_rate(optimizer, epoch, args)
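util.save_model in this example both checkpoints the model and returns an updated best loss. The helper is not shown; a sketch matching that contract, assuming a "save only when the test loss improves" policy and a hypothetical best_checkpoint.pth filename, is:

import torch

def save_model(model, optimizer, args, metrics, epoch, best_pred_loss,
               confusion_matrix):
    # average test loss, computed the same way the example logs it
    loss = metrics.data['loss'] / (metrics.data['total'] // args.batch_size + 1)
    if loss < best_pred_loss:
        best_pred_loss = loss
        torch.save({'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'confusion_matrix': confusion_matrix},
                   args.save + '/best_checkpoint.pth')
    return best_pred_loss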
Example #27
def compute_training_pairs(fold_idx, coll, n_iter_per_query, gt_file, dbn,
                           qbn):
    if not os.path.isfile('qn_rd_nrd_pairs_wn2v' + str(fold_idx) + '_' +
                          str(coll) + '_' + str(n_iter_per_query)):
        rd_b_qry = {}
        nrd_by_qry = {}
        for line in open(gt_file):
            data = line.split()
            qname = data[0].strip()
            dname = data[2].strip()
            if dname not in dbn.keys():
                continue
            rj = int(data[3].strip())
            if qname not in rd_b_qry.keys():
                rd_b_qry[qname] = []
                nrd_by_qry[qname] = []

            if rj > 0:
                rd_b_qry[qname].append(dname)
            else:
                nrd_by_qry[qname].append(dname)
        test_q_names = list(qbn.keys())
        np.random.shuffle(test_q_names)

        qn_rd_nrd_pairs = []
        for qn in test_q_names:
            if qn not in rd_b_qry.keys():
                continue
            tmp_rdocs = np.random.choice(rd_b_qry[qn],
                                         n_iter_per_query,
                                         replace=True)
            tmp_nrdocs = np.random.choice(nrd_by_qry[qn],
                                          n_iter_per_query,
                                          replace=True)
            for i in range(n_iter_per_query):
                qn_rd_nrd_pairs.append(
                    (qbn[qn], dbn[tmp_rdocs[i]], dbn[tmp_nrdocs[i]]))

        np.random.shuffle(qn_rd_nrd_pairs)
        util.save_model(
            qn_rd_nrd_pairs,
            'qn_rd_nrd_pairs_w2v_gk' + str(fold_idx) + '_' + str(coll))
    else:
        qn_rd_nrd_pairs = util.load_model('qn_rd_nrd_pairs_w2v_gk' +
                                          str(fold_idx) + '_' + str(coll))
    return qn_rd_nrd_pairs
Example #28
def main():
    # Freeze VAE, only optimize retrieval model
    solver = optim.Adam(model.retrieval_params, lr=args.lr)

    for epoch in range(args.n_epoch):
        print('\n\n-------------------------------------------')
        print('Epoch-{}'.format(epoch))
        print('-------------------------------------------')

        model.train()

        train_iter = enumerate(udc.get_iter('train'))

        if not args.no_tqdm:
            train_iter = tqdm(train_iter)
            train_iter.set_description_str('Training')
            train_iter.total = udc.n_train // udc.batch_size

        for it, mb in train_iter:
            context, response, y, cm, rm = mb

            output = model.forward(context, response, cm)
            loss = F.binary_cross_entropy_with_logits(output, y)
            # loss = F.mse_loss(F.sigmoid(output), y)

            loss.backward()
            #clip_gradient_threshold(model, -10, 10)
            solver.step()
            solver.zero_grad()

        # Validation
        recall_at_ks = eval_model_v1(model,
                                     udc,
                                     'valid',
                                     gpu=args.gpu,
                                     no_tqdm=args.no_tqdm)

        print(
            'Loss: {:.3f}; recall@1: {:.3f}; recall@2: {:.3f}; recall@5: {:.3f}'
            .format(loss.data[0], recall_at_ks[0], recall_at_ks[1],
                    recall_at_ks[4]))

        if epoch > 4:
            eval_test()

        save_model(model, 'GRU_VAE_pretrained')
Example #29
def compare_models():
    """Compares several classifiers by performing Beyesian optimization on each one and
    then ranking the results.
    """
    train, test = load_data()
    model_configs = get_model_configs()

    results = []
    for configs in model_configs:
        best_result = hyperparam_search(configs, train, test)
        print("top 2 accuracy:",
              get_top_k_accuracy(best_result["model"], test, k=2))
        print(generate_confusion_matrix(best_result["model"], test))
        save_model(best_result)
        results.append(best_result)

    rank_results(results)
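save_model(best_result) receives the dictionary produced by hyperparam_search, which at least contains the fitted estimator under 'model'. A hedged sketch using joblib (the output file name is an assumption) is:

import joblib

def save_model(result):
    # hypothetical helper: persist the fitted scikit-learn estimator from a search result
    joblib.dump(result['model'], 'best_model.joblib')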
Example #30
def ge_cmd_learn():
	args = parse_arg_learn()
	
	# prepare input to GE_learn
	data = GE_data()
	data.dat = util.load_data(args.data)
	data.labeled_features = util.load_labeled_features(args.labeled_features)
	init_model = GE_model()
	param = GE_param()
	if args.l2:
		param.l2_regularization = args.l2
	final_model_path = args.model

	# print data

	final_model = GE_learn(data, init_model, param)
	util.save_model(final_model, final_model_path)
	return
Example #31
def train():
    args = cli()

    device = torch.device("cuda" if args.gpu else "cpu")
    print(f'Device {device}')
    model = get_img_model(args.hidden_units, args.arch)

    optimizer = optim.Adam(model.classifier.parameters(), lr=0.001)

    model.to(device)

    trainloader, _, validationloader, class_to_idx = load_data(args.data_dir)

    _train(optimizer, args.epochs, trainloader, validationloader, device,
           model)

    save_model(args.save_dir, model, class_to_idx, args.hidden_units,
               args.arch)
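save_model is given the save directory, the model, the class-to-index mapping, and the architecture settings; assuming a single torch checkpoint file (a hypothetical checkpoint.pth), a sketch might be:

import os
import torch

def save_model(save_dir, model, class_to_idx, hidden_units, arch):
    # hypothetical helper: keep everything needed to rebuild the classifier for inference
    checkpoint = {
        'arch': arch,
        'hidden_units': hidden_units,
        'class_to_idx': class_to_idx,
        'state_dict': model.state_dict(),
    }
    torch.save(checkpoint, os.path.join(save_dir, 'checkpoint.pth'))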
Example #32
def encode_coll(docs_text_path, wi, output_encoded_coll_path):
    text_by_name = {}
    print('reading files in folder')
    for filename in tqdm(os.listdir(docs_text_path)):
        fp = os.path.join(docs_text_path, filename)
        if os.path.isfile(fp):
            text_by_name[filename.split(r'.')[0]] = ' '.join(
                open(fp, 'r').readlines())
    stoplist = load_indri_stopwords()
    encoded_coll_by_name = {}
    print('encoding collection')
    for tn, tt in tqdm(text_by_name.items()):
        tokenized = util.tokenize(tt, stemming=False, stoplist=stoplist)
        encoded_text = [wi[t] for t in tokenized if t in wi.keys()]
        encoded_coll_by_name[tn] = encoded_text
        util.save_model(encoded_text, os.path.join(output_encoded_coll_path,
                                                   tn))
    return encoded_coll_by_name
Example #33
def main():
  """
  Train the TensorFlow models.
  """

  # Get hyper-parameters
  if os.path.exists(FLAGS.checkpoint_path) == False:
    os.makedirs(FLAGS.checkpoint_path)
  checkpoint_file_path = FLAGS.checkpoint_path + "/checkpoint.ckpt"
  latest_checkpoint_file_path = tf.train.latest_checkpoint(
      FLAGS.checkpoint_path)

  if os.path.exists(FLAGS.output_path) == False:
    os.makedirs(FLAGS.output_path)

  # Step 1: Construct the dataset op
  epoch_number = FLAGS.epoch_number
  if epoch_number <= 0:
    epoch_number = -1
  train_buffer_size = FLAGS.train_batch_size * 3
  validation_buffer_size = FLAGS.train_batch_size * 3

  train_filename_list = [filename for filename in FLAGS.train_files.split(",")]
  train_filename_placeholder = tf.placeholder(tf.string, shape=[None])
  if FLAGS.file_format == "tfrecords":
    train_dataset = tf.data.TFRecordDataset(train_filename_placeholder)
    train_dataset = train_dataset.map(parse_tfrecords_function).repeat(
        epoch_number).batch(FLAGS.train_batch_size).shuffle(
            buffer_size=train_buffer_size)
  elif FLAGS.file_format == "csv":
    # Skip the header or not
    train_dataset = tf.data.TextLineDataset(train_filename_placeholder)
    train_dataset = train_dataset.map(parse_csv_function).repeat(
        epoch_number).batch(FLAGS.train_batch_size).shuffle(
            buffer_size=train_buffer_size)
  train_dataset_iterator = train_dataset.make_initializable_iterator()
  train_features_op, train_label_op = train_dataset_iterator.get_next()

  validation_filename_list = [
      filename for filename in FLAGS.validation_files.split(",")
  ]
  validation_filename_placeholder = tf.placeholder(tf.string, shape=[None])
  if FLAGS.file_format == "tfrecords":
    validation_dataset = tf.data.TFRecordDataset(
        validation_filename_placeholder)
    validation_dataset = validation_dataset.map(
        parse_tfrecords_function).repeat(epoch_number).batch(
            FLAGS.validation_batch_size).shuffle(
                buffer_size=validation_buffer_size)
  elif FLAGS.file_format == "csv":
    validation_dataset = tf.data.TextLineDataset(
        validation_filename_placeholder)
    validation_dataset = validation_dataset.map(parse_csv_function).repeat(
        epoch_number).batch(FLAGS.validation_batch_size).shuffle(
            buffer_size=validation_buffer_size)
  validation_dataset_iterator = validation_dataset.make_initializable_iterator(
  )
  validation_features_op, validation_label_op = validation_dataset_iterator.get_next(
  )

  # Step 2: Define the model
  input_units = FLAGS.feature_size
  output_units = FLAGS.label_size
  logits = inference(train_features_op, input_units, output_units, True)

  if FLAGS.loss == "sparse_cross_entropy":
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=train_label_op)
    loss = tf.reduce_mean(cross_entropy, name="loss")
  elif FLAGS.loss == "cross_entropy":
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
        logits=logits, labels=train_label_op)
    loss = tf.reduce_mean(cross_entropy, name="loss")
  elif FLAGS.loss == "mean_square":
    msl = tf.square(logits - train_label_op, name="msl")
    loss = tf.reduce_mean(msl, name="loss")

  global_step = tf.Variable(0, name="global_step", trainable=False)
  learning_rate = FLAGS.learning_rate

  if FLAGS.enable_lr_decay:
    logging.info(
        "Enable learning rate decay rate: {}".format(FLAGS.lr_decay_rate))
    starter_learning_rate = FLAGS.learning_rate
    learning_rate = tf.train.exponential_decay(
        starter_learning_rate,
        global_step,
        100000,
        FLAGS.lr_decay_rate,
        staircase=True)

  optimizer = util.get_optimizer_by_name(FLAGS.optimizer, learning_rate)
  train_op = optimizer.minimize(loss, global_step=global_step)

  # Need to re-use the Variables for training and validation
  tf.get_variable_scope().reuse_variables()

  # Define accuracy op and auc op for train
  train_accuracy_logits = inference(train_features_op, input_units,
                                    output_units, False)
  train_softmax_op, train_accuracy_op = model.compute_softmax_and_accuracy(
      train_accuracy_logits, train_label_op)
  train_auc_op = model.compute_auc(train_softmax_op, train_label_op,
                                   FLAGS.label_size)

  # Define accuracy op and auc op for validation
  validation_accuracy_logits = inference(validation_features_op, input_units,
                                         output_units, False)
  validation_softmax_op, validation_accuracy_op = model.compute_softmax_and_accuracy(
      validation_accuracy_logits, validation_label_op)
  validation_auc_op = model.compute_auc(validation_softmax_op,
                                        validation_label_op, FLAGS.label_size)

  # Define inference op
  inference_features = tf.placeholder(
      "float", [None, FLAGS.feature_size], name="features")
  inference_logits = inference(inference_features, input_units, output_units,
                               False)
  inference_softmax_op = tf.nn.softmax(
      inference_logits, name="inference_softmax")
  inference_prediction_op = tf.argmax(
      inference_softmax_op, 1, name="inference_prediction")
  keys_placeholder = tf.placeholder(tf.int32, shape=[None, 1], name="keys")
  keys_identity = tf.identity(keys_placeholder, name="inference_keys")

  signature_def_map = {
      signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
      signature_def_utils.build_signature_def(
          inputs={
              "keys": utils.build_tensor_info(keys_placeholder),
              "features": utils.build_tensor_info(inference_features)
          },
          outputs={
              "keys": utils.build_tensor_info(keys_identity),
              "prediction": utils.build_tensor_info(inference_prediction_op),
          },
          method_name="tensorflow/serving/predictss"),
      "serving_detail":
      signature_def_utils.build_signature_def(
          inputs={
              "keys": utils.build_tensor_info(keys_placeholder),
              "features": utils.build_tensor_info(inference_features)
          },
          outputs={
              "keys": utils.build_tensor_info(keys_identity),
              "prediction": utils.build_tensor_info(inference_prediction_op),
              "softmax": utils.build_tensor_info(inference_softmax_op),
          },
          method_name="sdfas")
  }

  # Initialize saver and summary
  saver = tf.train.Saver()
  tf.summary.scalar("loss", loss)
  if FLAGS.scenario == "classification":
    tf.summary.scalar("train_accuracy", train_accuracy_op)
    tf.summary.scalar("train_auc", train_auc_op)
    tf.summary.scalar("validate_accuracy", validation_accuracy_op)
    tf.summary.scalar("validate_auc", validation_auc_op)
  summary_op = tf.summary.merge_all()
  init_op = [
      tf.global_variables_initializer(),
      tf.local_variables_initializer()
  ]

  # Step 3: Create session to run
  with tf.Session() as sess:
    writer = tf.summary.FileWriter(FLAGS.output_path, sess.graph)
    sess.run(init_op)
    sess.run(
        [
            train_dataset_iterator.initializer,
            validation_dataset_iterator.initializer
        ],
        feed_dict={
            train_filename_placeholder: train_filename_list,
            validation_filename_placeholder: validation_filename_list
        })

    if FLAGS.mode == "train":
      if FLAGS.resume_from_checkpoint:
        util.restore_from_checkpoint(sess, saver, latest_checkpoint_file_path)

      try:
        start_time = datetime.datetime.now()

        while True:
          if FLAGS.enable_benchmark:
            sess.run(train_op)
          else:

            _, global_step_value = sess.run([train_op, global_step])

            # Step 4: Display training metrics after steps
            if global_step_value % FLAGS.steps_to_validate == 0:
              if FLAGS.scenario == "classification":
                loss_value, train_accuracy_value, train_auc_value, validate_accuracy_value, validate_auc_value, summary_value = sess.run(
                    [
                        loss, train_accuracy_op, train_auc_op,
                        validation_accuracy_op, validation_auc_op, summary_op
                    ])
                end_time = datetime.datetime.now()

                logging.info(
                    "[{}] Step: {}, loss: {}, train_acc: {}, train_auc: {}, valid_acc: {}, valid_auc: {}".
                    format(end_time - start_time, global_step_value,
                           loss_value, train_accuracy_value, train_auc_value,
                           validate_accuracy_value, validate_auc_value))

              elif FLAGS.scenario == "regression":
                loss_value, summary_value = sess.run([loss, summary_op])
                end_time = datetime.datetime.now()
                logging.info("[{}] Step: {}, loss: {}".format(
                    end_time - start_time, global_step_value, loss_value))

              writer.add_summary(summary_value, global_step_value)
              saver.save(
                  sess, checkpoint_file_path, global_step=global_step_value)

              start_time = end_time

      except tf.errors.OutOfRangeError:
        if FLAGS.enable_benchmark:
          logging.info("Finish training for benchmark")
        else:
          # Step 5: Export the model after training
          util.save_model(
              FLAGS.model_path,
              FLAGS.model_version,
              sess,
              signature_def_map,
              is_save_graph=False)

    elif FLAGS.mode == "savedmodel":
      if util.restore_from_checkpoint(sess, saver,
                                      latest_checkpoint_file_path) == False:
        logging.error("No checkpoint for exporting model, exit now")
        return

      util.save_model(
          FLAGS.model_path,
          FLAGS.model_version,
          sess,
          signature_def_map,
          is_save_graph=False)

    elif FLAGS.mode == "inference":
      if util.restore_from_checkpoint(sess, saver,
                                      latest_checkpoint_file_path) == False:
        logging.error("No checkpoint for inference, exit now")
        return

      # Load test data
      inference_result_file_name = FLAGS.inference_result_file
      inference_test_file_name = FLAGS.inference_data_file
      inference_data = np.genfromtxt(inference_test_file_name, delimiter=",")
      inference_data_features = inference_data[:, 0:9]
      inference_data_labels = inference_data[:, 9]

      # Run inference
      start_time = datetime.datetime.now()
      prediction, prediction_softmax = sess.run(
          [inference_prediction_op, inference_softmax_op],
          feed_dict={inference_features: inference_data_features})
      end_time = datetime.datetime.now()

      # Compute accuracy
      label_number = len(inference_data_labels)
      correct_label_number = 0
      for i in range(label_number):
        if inference_data_labels[i] == prediction[i]:
          correct_label_number += 1
      accuracy = float(correct_label_number) / label_number

      # Compute auc
      y_true = np.array(inference_data_labels)
      y_score = prediction_softmax[:, 1]
      fpr, tpr, thresholds = metrics.roc_curve(y_true, y_score, pos_label=1)
      auc = metrics.auc(fpr, tpr)
      logging.info("[{}] Inference accuracy: {}, auc: {}".format(
          end_time - start_time, accuracy, auc))

      # Save result into the file
      np.savetxt(inference_result_file_name, prediction_softmax, delimiter=",")
      logging.info(
          "Save result to file: {}".format(inference_result_file_name))
def main():
  if not os.path.exists(FLAGS.checkpoint_path):
    os.makedirs(FLAGS.checkpoint_path)
  checkpoint_file_path = os.path.join(FLAGS.checkpoint_path, "checkpoint.ckpt")
  latest_checkpoint_file_path = tf.train.latest_checkpoint(
      FLAGS.checkpoint_path)

  if not os.path.exists(FLAGS.output_path):
    os.makedirs(FLAGS.output_path)

  # Step 1: Construct the dataset op
  epoch_number = FLAGS.epoch_number
  if epoch_number <= 0:
    epoch_number = -1
  train_buffer_size = FLAGS.train_batch_size * 3
  validation_buffer_size = FLAGS.validation_batch_size * 3

  train_filename_list = [filename for filename in FLAGS.train_files.split(",")]
  train_filename_placeholder = tf.placeholder(tf.string, shape=[None])
  train_dataset = tf.data.TFRecordDataset(train_filename_placeholder)
  # Shuffle individual examples before batching so the shuffle buffer holds
  # records rather than whole batches
  train_dataset = train_dataset.map(parse_tfrecords_function).repeat(
      epoch_number).shuffle(buffer_size=train_buffer_size).batch(
          FLAGS.train_batch_size)
  train_dataset_iterator = train_dataset.make_initializable_iterator()
  batch_labels, batch_ids, batch_values = train_dataset_iterator.get_next()

  validation_filename_list = [
      filename for filename in FLAGS.validation_files.split(",")
  ]
  validation_filename_placeholder = tf.placeholder(tf.string, shape=[None])
  validation_dataset = tf.data.TFRecordDataset(validation_filename_placeholder)
  validation_dataset = validation_dataset.map(parse_tfrecords_function).repeat(
  ).shuffle(buffer_size=validation_buffer_size).batch(
      FLAGS.validation_batch_size)
  validation_dataset_iterator = validation_dataset.make_initializable_iterator()
  validation_labels, validation_ids, validation_values = validation_dataset_iterator.get_next()

  # Define the model
  logits = inference(batch_ids, batch_values, True)
  batch_labels = tf.to_int64(batch_labels)
  cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
      logits=logits, labels=batch_labels)
  loss = tf.reduce_mean(cross_entropy, name="loss")
  global_step = tf.Variable(0, name="global_step", trainable=False)
  if FLAGS.enable_lr_decay:
    logging.info(
        "Enable learning rate decay rate: {}".format(FLAGS.lr_decay_rate))
    starter_learning_rate = FLAGS.learning_rate
    learning_rate = tf.train.exponential_decay(
        starter_learning_rate,
        global_step,
        100000,
        FLAGS.lr_decay_rate,
        staircase=True)
  else:
    learning_rate = FLAGS.learning_rate
  optimizer = util.get_optimizer_by_name(FLAGS.optimizer, learning_rate)
  train_op = optimizer.minimize(loss, global_step=global_step)
  tf.get_variable_scope().reuse_variables()

  # Define accuracy op for train data
  train_accuracy_logits = inference(batch_ids, batch_values, False)
  train_softmax = tf.nn.softmax(train_accuracy_logits)
  train_correct_prediction = tf.equal(
      tf.argmax(train_softmax, 1), batch_labels)
  train_accuracy = tf.reduce_mean(
      tf.cast(train_correct_prediction, tf.float32))

  # Define auc op for train data
  batch_labels = tf.cast(batch_labels, tf.int32)
  sparse_labels = tf.reshape(batch_labels, [-1, 1])
  derived_size = tf.shape(batch_labels)[0]
  indices = tf.reshape(tf.range(0, derived_size, 1), [-1, 1])
  concated = tf.concat(axis=1, values=[indices, sparse_labels])
  outshape = tf.stack([derived_size, FLAGS.label_size])
  new_train_batch_labels = tf.sparse_to_dense(concated, outshape, 1.0, 0.0)
  _, train_auc = tf.contrib.metrics.streaming_auc(train_softmax,
                                                  new_train_batch_labels)
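  # The block above scatters each integer label into a dense one-hot row of
  # shape [batch, label_size] so streaming_auc can compare the per-class
  # softmax probabilities against per-class targets; tf.one_hot(batch_labels,
  # FLAGS.label_size) would build the same matrix more directly.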

  # Define accuracy op for validate data
  validate_accuracy_logits = inference(validation_ids, validation_values,
                                       False)
  validate_softmax = tf.nn.softmax(validate_accuracy_logits)
  validate_batch_labels = tf.to_int64(validation_labels)
  validate_correct_prediction = tf.equal(
      tf.argmax(validate_softmax, 1), validate_batch_labels)
  validate_accuracy = tf.reduce_mean(
      tf.cast(validate_correct_prediction, tf.float32))

  # Define auc op for validate data
  validate_batch_labels = tf.cast(validate_batch_labels, tf.int32)
  sparse_labels = tf.reshape(validate_batch_labels, [-1, 1])
  derived_size = tf.shape(validate_batch_labels)[0]
  indices = tf.reshape(tf.range(0, derived_size, 1), [-1, 1])
  concated = tf.concat(axis=1, values=[indices, sparse_labels])
  outshape = tf.stack([derived_size, FLAGS.label_size])
  new_validate_batch_labels = tf.sparse_to_dense(concated, outshape, 1.0, 0.0)
  _, validate_auc = tf.contrib.metrics.streaming_auc(validate_softmax,
                                                     new_validate_batch_labels)

  # Define inference op
  sparse_index = tf.placeholder(tf.int64, [None, 2])
  sparse_ids = tf.placeholder(tf.int64, [None])
  sparse_values = tf.placeholder(tf.float32, [None])
  sparse_shape = tf.placeholder(tf.int64, [2])
  inference_ids = tf.SparseTensor(sparse_index, sparse_ids, sparse_shape)
  inference_values = tf.SparseTensor(sparse_index, sparse_values, sparse_shape)
  inference_logits = inference(inference_ids, inference_values, False)
  inference_softmax = tf.nn.softmax(inference_logits)
  inference_op = tf.argmax(inference_softmax, 1)
  keys_placeholder = tf.placeholder(tf.int32, shape=[None, 1])
  keys = tf.identity(keys_placeholder)
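  # "keys" is a pure pass-through: clients send an id per instance and get the
  # same id back next to the prediction, which makes it easy to join served
  # results with the original requests.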

  signature_def_map = {
      signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
      signature_def_utils.build_signature_def(
          inputs={
              "keys": utils.build_tensor_info(keys_placeholder),
              "indexs": utils.build_tensor_info(sparse_index),
              "ids": utils.build_tensor_info(sparse_ids),
              "values": utils.build_tensor_info(sparse_values),
              "shape": utils.build_tensor_info(sparse_shape)
          },
          outputs={
              "keys": utils.build_tensor_info(keys),
              "softmax": utils.build_tensor_info(inference_softmax),
              "prediction": utils.build_tensor_info(inference_op)
          },
          method_name=signature_constants.PREDICT_METHOD_NAME)
  }
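  # As an illustration (not part of the original script), a client calling this
  # signature for two instances with three non-zero features in total would
  # feed something like:
  #   keys   = [[0], [1]]
  #   indexs = [[0, 0], [0, 1], [1, 0]]   # [instance index, feature position]
  #   ids    = [3, 7, 2]                  # feature ids
  #   values = [1.0, 0.5, 1.0]            # feature values
  #   shape  = [2, FLAGS.feature_size]
  # and receive "keys", "softmax" and "prediction" back.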

  # Initialize saver and summary
  saver = tf.train.Saver()
  tf.summary.scalar("loss", loss)
  tf.summary.scalar("train_accuracy", train_accuracy)
  tf.summary.scalar("train_auc", train_auc)
  tf.summary.scalar("validate_accuracy", validate_accuracy)
  tf.summary.scalar("validate_auc", validate_auc)
  summary_op = tf.summary.merge_all()
  init_op = [
      tf.global_variables_initializer(),
      tf.local_variables_initializer()
  ]
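  # tf.local_variables_initializer() is required here because the streaming_auc
  # ops above keep their running counts in local variables.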

  # Create session to run
  with tf.Session() as sess:
    writer = tf.summary.FileWriter(FLAGS.output_path, sess.graph)
    sess.run(init_op)
    sess.run(
        train_dataset_iterator.initializer,
        feed_dict={train_filename_placeholder: train_filename_list})
    sess.run(
        validation_dataset_iterator.initializer,
        feed_dict={validation_filename_placeholder: validation_filename_list})

    if FLAGS.mode == "train":
      # Restore session and start queue runner
      util.restore_from_checkpoint(sess, saver, latest_checkpoint_file_path)
      coord = tf.train.Coordinator()
      threads = tf.train.start_queue_runners(coord=coord, sess=sess)
      start_time = datetime.datetime.now()

      try:
        while not coord.should_stop():
          if FLAGS.benchmark_mode:
            sess.run(train_op)
          else:
            _, step = sess.run([train_op, global_step])

            # Print state while training
            if step % FLAGS.steps_to_validate == 0:
              loss_value, train_accuracy_value, train_auc_value, validate_accuracy_value, validate_auc_value, summary_value = sess.run(
                  [
                      loss, train_accuracy, train_auc, validate_accuracy,
                      validate_auc, summary_op
                  ])
              end_time = datetime.datetime.now()

              logging.info(
                  "[{}] Step: {}, loss: {}, train_acc: {}, train_auc: {}, valid_acc: {}, valid_auc: {}".
                  format(end_time - start_time, step, loss_value,
                         train_accuracy_value, train_auc_value,
                         validate_accuracy_value, validate_auc_value))
              writer.add_summary(summary_value, step)
              saver.save(sess, checkpoint_file_path, global_step=step)
              start_time = end_time
      except tf.errors.OutOfRangeError:
        if FLAGS.benchmark_mode:
          print("Finish training for benchmark")
          exit(0)
        else:
          # Export the model after training
          util.save_model(
              FLAGS.model_path,
              FLAGS.model_version,
              sess,
              signature_def_map,
              is_save_graph=False)
      finally:
        coord.request_stop()
      coord.join(threads)

    elif FLAGS.mode == "save_model":
      if not util.restore_from_checkpoint(sess, saver,
                                          latest_checkpoint_file_path):
        logging.error("No checkpoint found, exit now")
        exit(1)

      util.save_model(
          FLAGS.model_path,
          FLAGS.model_version,
          sess,
          signature_def_map,
          is_save_graph=False)

    elif FLAGS.mode == "inference":
      if not util.restore_from_checkpoint(sess, saver,
                                          latest_checkpoint_file_path):
        logging.error("No checkpoint found, exit now")
        exit(1)

      # Load inference test data
      inference_result_file_name = "./inference_result.txt"
      inference_test_file_name = "./data/a8a_test.libsvm"
      labels = []
      feature_ids = []
      feature_values = []
      feature_index = []
      ins_num = 0
      with open(inference_test_file_name, "r") as inference_test_file:
        for line in inference_test_file:
          tokens = line.split(" ")
          labels.append(int(tokens[0]))
          feature_num = 0
          for feature in tokens[1:]:
            feature_id, feature_value = feature.split(":")
            feature_ids.append(int(feature_id))
            feature_values.append(float(feature_value))
            feature_index.append([ins_num, feature_num])
            feature_num += 1
          ins_num += 1
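      # For example, a libsvm line "1 3:1 10:0.5" parsed as instance k adds
      # label 1, feature_ids [3, 10], feature_values [1.0, 0.5] and index pairs
      # [k, 0], [k, 1]; together these feed the SparseTensor placeholders below.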

      # Run inference
      start_time = datetime.datetime.now()
      prediction, prediction_softmax = sess.run(
          [inference_op, inference_softmax],
          feed_dict={
              sparse_index: feature_index,
              sparse_ids: feature_ids,
              sparse_values: feature_values,
              sparse_shape: [ins_num, FLAGS.feature_size]
          })

      end_time = datetime.datetime.now()

      # Compute accuracy
      label_number = len(labels)
      correct_label_number = 0
      for i in range(label_number):
        if labels[i] == prediction[i]:
          correct_label_number += 1
      accuracy = float(correct_label_number) / label_number

      # Compute auc
      expected_labels = np.array(labels)
      predict_labels = prediction_softmax[:, 0]
      fpr, tpr, thresholds = metrics.roc_curve(
          expected_labels, predict_labels, pos_label=0)
      auc = metrics.auc(fpr, tpr)
      logging.info("[{}] Inference accuracy: {}, auc: {}".format(
          end_time - start_time, accuracy, auc))

      # Save result into the file
      np.savetxt(inference_result_file_name, prediction_softmax, delimiter=",")
      logging.info(
          "Save result to file: {}".format(inference_result_file_name))

    elif FLAGS.mode == "inference_with_tfrecords":
      if not util.restore_from_checkpoint(sess, saver,
                                          latest_checkpoint_file_path):
        logging.error("No checkpoint found, exit now")
        exit(1)

      # Load inference test data
      inference_result_file_name = "./inference_result.txt"
      inference_test_file_name = "./data/a8a/a8a_test.libsvm.tfrecords"

      batch_feature_index = []
      batch_labels = []
      batch_ids = []
      batch_values = []
      ins_num = 0

      # Read from TFRecords files
      for serialized_example in tf.python_io.tf_record_iterator(
          inference_test_file_name):
        # Get serialized example from file
        example = tf.train.Example()
        example.ParseFromString(serialized_example)
        # The label is stored as a single-element float_list, so take the
        # scalar; otherwise the accuracy comparison below would compare a
        # repeated field against an integer prediction and always fail.
        label = example.features.feature["label"].float_list.value[0]
        ids = example.features.feature["ids"].int64_list.value
        values = example.features.feature["values"].float_list.value
        #print("label: {}, features: {}".format(label, " ".join([str(id) + ":" + str(value) for id, value in zip(ids, values)])))
        batch_labels.append(label)
        # Notice that we use extend() instead of append() to flatten the values
        batch_ids.extend(ids)
        batch_values.extend(values)
        for i in range(len(ids)):
          batch_feature_index.append([ins_num, i])

        ins_num += 1

      # Run inference
      start_time = datetime.datetime.now()
      prediction, prediction_softmax = sess.run(
          [inference_op, inference_softmax],
          feed_dict={
              sparse_index: batch_feature_index,
              sparse_ids: batch_ids,
              sparse_values: batch_values,
              sparse_shape: [ins_num, FLAGS.feature_size]
          })

      end_time = datetime.datetime.now()

      # Compute accuracy
      label_number = len(batch_labels)
      correct_label_number = 0
      for i in range(label_number):
        if batch_labels[i] == prediction[i]:
          correct_label_number += 1
      accuracy = float(correct_label_number) / label_number

      # Compute auc
      expected_labels = np.array(batch_labels)
      predict_labels = prediction_softmax[:, 0]
      fpr, tpr, thresholds = metrics.roc_curve(
          expected_labels, predict_labels, pos_label=0)
      auc = metrics.auc(fpr, tpr)
      logging.info("[{}] Inference accuracy: {}, auc: {}".format(
          end_time - start_time, accuracy, auc))

      # Save result into the file
      np.savetxt(inference_result_file_name, prediction_softmax, delimiter=",")
      logging.info(
          "Save result to file: {}".format(inference_result_file_name))