Example #1
def load_data(train_data_path, valid_data_path, test_data_path, seq_length):
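    # Build a DataLoader over the three splits, format it, and return
    # (length, data) pairs for the train, validation and test sets.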
    data_loader = DataLoader(train_data_path,
                             valid_data_path,
                             test_data_path,
                             seq_length=seq_length)
    data_loader.format()
    return data_loader.train_len, data_loader.train_data, data_loader.valid_len, \
        data_loader.valid_data, data_loader.test_len, data_loader.test_data
Example #2
class EvalCallback(Callback):
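    """Tensorpack callback: runs the eval predictor on one batch each epoch and logs 'eval/accuracy'."""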
    def _setup_graph(self):
        self.pred = self.trainer.get_predictor(get_eval_input_names(),
                                               get_eval_output_names())
        self.data_loader = DataLoader(audio_meta,
                                      hp.eval.batch_size).dataflow()

    def _trigger_epoch(self):
        _, mel_spec, speaker_id = next(self.data_loader.get_data())
        acc, = self.pred(mel_spec, speaker_id)
        self.trainer.monitors.put_scalar('eval/accuracy', acc)
Example #3
"""
Load a subset of data required by configure file (cfg_target)
"""
import cfg_target
from data_load import DataLoader
data = DataLoader(cfg_target)

target = data.data_download_target()
covariates_us, covariates_sea, covariates_global, spatial_covariates, temporal_covariates = data.data_download_cov()
Example #4
    def _setup_graph(self):
        self.pred = self.trainer.get_predictor(
            get_eval_input_names(),
            get_eval_output_names())
        self.data_loader = DataLoader(audio_meta, hp.eval.batch_size).dataflow()
Example #5
    parser.add_argument('-ckpt', help='checkpoint to load model.')
    parser.add_argument('-gpu', help='comma separated list of GPU(s) to use.')
    parser.add_argument('-r', action='store_true', help='start training from the beginning.')
    parser.add_argument('-remote', action='store_true', help='use remote dataflow.')
    parser.add_argument('-port', type=int, default=0)
    args = parser.parse_args()

    # set hyper-parameters from yaml file
    hp.set_hparam_yaml(case=args.case)

    # dataflow
    audio_meta = AudioMeta(hp.train.data_path)
    if args.remote:
        df = get_remote_dataflow(args.port, hp.train.batch_size)
    else:
        df = DataLoader(audio_meta, hp.train.batch_size).dataflow(nr_prefetch=5000, nr_thread=int(multiprocessing.cpu_count() // 1.5))

    # set logger for event and model saver
    logger.set_logger_dir(hp.logdir)
    train_conf = TrainConfig(
        model=ClassificationModel(num_classes=audio_meta.num_speaker, **hp.model),
        data=FlexibleQueueInput(df, capacity=500),
        callbacks=[
            ModelSaver(checkpoint_dir=hp.logdir),
            EvalCallback()
        ],
        steps_per_epoch=hp.train.steps_per_epoch,
        # session_config=session_config
    )
Example #6
    parser.add_argument('--y', action='store_true')
    parser.add_argument('--evind', action='store_true')
    parser.add_argument('--all', action='store_true')

    parser.add_argument('vars', nargs=argparse.REMAINDER)

    args = parser.parse_args()

    base_dir = 'data/'

    if len(args.vars) == 0:
        print('no base dir provided, using ./data as default')
    else:
        base_dir = args.vars[0]

    du = DataLoader()
    du.load_data(sub_sample=False)

    if args.brand or args.all:
        gen_vectorized_data(du, ['brand_code'], 'brand', base_dir)
    if args.model or args.all:
        gen_vectorized_data(du, ['model_code'], 'model', base_dir)
    if args.label or args.all:
        gen_vectorized_data(du, ['label_id_bag'], 'label', base_dir)
    if args.app or args.all:
        gen_vectorized_data(du, ['app_id_bag'], 'appid', base_dir)
    if args.term or args.all:
        gen_term_data(du, base_dir)
    if args.y or args.all:
        gen_y(du, base_dir)
    if args.evind or args.all:
Example #7
    def setUp(self):  # pylint: disable=g-missing-super-call
        self.loader = DataLoader("./data/train",
                                 "./data/valid",
                                 "./data/test",
                                 seq_length=512)
Example #8
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu',
                        type=int,
                        default=-1,
                        help='GPU id to use (e.g. 0); -1 runs on CPU')
    parser.add_argument('--max_iter',
                        type=int,
                        default=10,
                        help='number of iterations')
    parser.add_argument('--decay_epoch',
                        type=int,
                        default=20,
                        help='epoch interval for learning rate decay')
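    # NOTE: argparse's type=bool does not parse "False"/"True" strings -- any
    # non-empty value becomes True. The flags below are kept as in the
    # original; store_true/store_false actions would be the usual fix.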
    parser.add_argument('--test',
                        type=bool,
                        default=False,
                        help='enable testing')
    parser.add_argument('--train_test',
                        type=bool,
                        default=True,
                        help='also evaluate on the training set')
    parser.add_argument('--show',
                        type=bool,
                        default=True,
                        help='print progress')
    parser.add_argument('--init_std',
                        type=float,
                        default=0.1,
                        help='weight initialization std')
    parser.add_argument('--init_lr',
                        type=float,
                        default=0.01,
                        help='initial learning rate')
    parser.add_argument('--lr_decay',
                        type=float,
                        default=0.75,
                        help='learning rate decay')
    parser.add_argument(
        '--final_lr',
        type=float,
        default=1E-5,
        help='learning rate will not decrease after hitting this threshold')
    parser.add_argument('--momentum',
                        type=float,
                        default=0.9,
                        help='momentum rate')
    parser.add_argument('--max_grad_norm',
                        type=float,
                        default=3.0,
                        help='maximum gradient norm')
    parser.add_argument('--hidden_dim',
                        type=int,
                        default=128,
                        help='hidden layer dimension')
    parser.add_argument('--n_hidden',
                        type=int,
                        default=2,
                        help='hidden numbers')

    dataset = 'assist2009_updated'
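    # this hard-coded name selects which block of dataset-specific defaults is registered below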

    if dataset == 'oj':
        parser.add_argument('--batch_size',
                            type=int,
                            default=5,
                            help='the batch size')
        parser.add_argument('--qa_embed_dim',
                            type=int,
                            default=100,
                            help='answer and question embedding dimensions')
        parser.add_argument(
            '--n_question',
            type=int,
            default=68,
            help='the number of unique questions in the dataset')
        parser.add_argument('--seqlen',
                            type=int,
                            default=200,
                            help='the allowed maximum length of a sequence')
        parser.add_argument('--data_dir',
                            type=str,
                            default='./data/oj',
                            help='data directory')
        parser.add_argument('--data_name',
                            type=str,
                            default='oj',
                            help='data set name')
        parser.add_argument('--load',
                            type=str,
                            default='oj',
                            help='model file to load')
        parser.add_argument('--save',
                            type=str,
                            default='oj',
                            help='path to save model')

    elif dataset == 'assistments':
        parser.add_argument('--batch_size',
                            type=int,
                            default=32,
                            help='the batch size')
        parser.add_argument('--qa_embed_dim',
                            type=int,
                            default=200,
                            help='answer and question embedding dimensions')
        parser.add_argument(
            '--n_question',
            type=int,
            default=124,
            help='the number of unique questions in the dataset')
        parser.add_argument('--seqlen',
                            type=int,
                            default=200,
                            help='the allowed maximum length of a sequence')
        parser.add_argument('--data_dir',
                            type=str,
                            default='./data/assistments',
                            help='data directory')
        parser.add_argument('--data_name',
                            type=str,
                            default='assistments',
                            help='data set name')
        parser.add_argument('--load',
                            type=str,
                            default='assistments',
                            help='model file to load')
        parser.add_argument('--save',
                            type=str,
                            default='assistments',
                            help='path to save model')

    elif dataset == 'assist2009_updated':
        parser.add_argument('--batch_size',
                            type=int,
                            default=32,
                            help='the batch size')
        parser.add_argument('--qa_embed_dim',
                            type=int,
                            default=200,
                            help='answer and question embedding dimensions')
        parser.add_argument(
            '--n_question',
            type=int,
            default=110,
            help='the number of unique questions in the dataset')
        parser.add_argument('--seqlen',
                            type=int,
                            default=200,
                            help='the allowed maximum length of a sequence')
        parser.add_argument('--data_dir',
                            type=str,
                            default='../../dataset/assist2009_updated',
                            help='data directory')
        parser.add_argument('--data_name',
                            type=str,
                            default='assist2009_updated',
                            help='data set name')
        parser.add_argument('--load',
                            type=str,
                            default='assist2009_updated',
                            help='model file to load')
        parser.add_argument('--save',
                            type=str,
                            default='assist2009_updated',
                            help='path to save model')

    elif dataset == 'STATICS':
        parser.add_argument('--batch_size',
                            type=int,
                            default=10,
                            help='the batch size')
        parser.add_argument('--qa_embed_dim',
                            type=int,
                            default=100,
                            help='answer and question embedding dimensions')
        parser.add_argument(
            '--n_question',
            type=int,
            default=1223,
            help='the number of unique questions in the dataset')
        parser.add_argument('--seqlen',
                            type=int,
                            default=800,
                            help='the allowed maximum length of a sequence')
        parser.add_argument('--data_dir',
                            type=str,
                            default='./data/STATICS',
                            help='data directory')
        parser.add_argument('--data_name',
                            type=str,
                            default='STATICS',
                            help='data set name')
        parser.add_argument('--load',
                            type=str,
                            default='STATICS',
                            help='model file to load')
        parser.add_argument('--save',
                            type=str,
                            default='STATICS',
                            help='path to save model')

    params = parser.parse_args()
    params.lr = params.init_lr

    print(params)

    dat = DataLoader(',', params.seqlen, 1, 0)
    # dat = DATA(n_question=params.n_question, seqlen=params.seqlen, separate_char=',')
    # train_data_path = params.data_dir + "/" + "builder_train.csv"
    # valid_data_path = params.data_dir + "/" + "builder_test.csv"

    train_data_path = params.data_dir + "/" + params.data_name + "_train1.csv"
    valid_data_path = params.data_dir + "/" + params.data_name + "_valid1.csv"
    # test_data_path = params.data_dir + "/" + params.data_name + "_test.csv"

    max_length, min_length, max_q_id = dat.scan_file(train_data_path)
    train_q_data, train_q_t_data, train_answer_data = dat.prepare_model_data(
        train_data_path, max_q_id)
    train_q_data = np.array(train_q_data)
    print(train_q_data.shape)
    train_q_t_data = np.array(train_q_t_data)
    train_answer_data = np.array(train_answer_data)

    valid_q_data, valid_q_t_data, valid_answer_data = dat.prepare_model_data(
        valid_data_path, max_q_id)
    valid_q_data = np.array(valid_q_data)
    valid_q_t_data = np.array(valid_q_t_data)
    valid_answer_data = np.array(valid_answer_data)
    # train_q_data, train_q_t_data, train_answer_data = dat.load_data(train_data_path)
    # valid_q_data, valid_q_t_data, valid_answer_data = dat.load_data(valid_data_path)
    # test_q_data, test_q_t_data, test_answer_data = dat.load_data(test_data_path)

    model = MODEL(n_question=params.n_question,
                  hidden_dim=params.hidden_dim,
                  x_embed_dim=params.qa_embed_dim,
                  hidden_layers=params.n_hidden,
                  gpu=params.gpu)

    model.init_embeddings()
    model.init_params()
    # model = torch.load(params.data_dir + "/save/"+params.save)
    # optimizer = optim.SGD(params=model.parameters(), lr=params.lr, momentum=params.momentum)
    optimizer = optim.Adam(params=model.parameters(),
                           lr=params.lr,
                           betas=(0.9, 0.9))

    if params.gpu >= 0:
        print('device: ' + str(params.gpu))
        torch.cuda.set_device(params.gpu)
        model.cuda()

    all_train_loss = {}
    all_train_accuracy = {}
    all_train_auc = {}
    all_valid_loss = {}
    all_valid_accuracy = {}
    all_valid_auc = {}
    best_valid_auc = 0

    for idx in range(params.max_iter):
        train_loss, train_accuracy, train_auc = train(model, idx, params,
                                                      optimizer, train_q_data,
                                                      train_q_t_data,
                                                      train_answer_data)
        print(
            'Epoch %d/%d, loss : %3.5f, auc : %3.5f, accuracy : %3.5f' %
            (idx + 1, params.max_iter, train_loss, train_auc, train_accuracy))
        valid_loss, valid_accuracy, valid_auc = test(model, params, optimizer,
                                                     valid_q_data,
                                                     valid_q_t_data,
                                                     valid_answer_data)
        print('Epoch %d/%d, valid auc : %3.5f, valid accuracy : %3.5f' %
              (idx + 1, params.max_iter, valid_auc, valid_accuracy))
        # test_loss, test_accuracy, test_auc = test(model, params, optimizer, test_q_data, test_q_t_data,
        #                                           test_answer_data)
        # print('Epoch %d/%d, test auc : %3.5f, test accuracy : %3.5f' % (
        #     idx + 1, params.max_iter, test_auc, test_accuracy))
        all_train_auc[idx + 1] = train_auc
        all_train_accuracy[idx + 1] = train_accuracy
        all_train_loss[idx + 1] = train_loss
        all_valid_loss[idx + 1] = valid_loss
        all_valid_accuracy[idx + 1] = valid_accuracy
        all_valid_auc[idx + 1] = valid_auc
        #
        # output the epoch with the best validation auc
        if valid_auc > best_valid_auc:
            print('%3.4f to %3.4f' % (best_valid_auc, valid_auc))
            best_valid_auc = valid_auc
Example #9
        post_optimizer = self.build_optimizer(post_loss, "post_optimizer")

        inputs = {"initial_state": initial_state, "encoder_inputs": encode_inputs, "encoder_inputs_len": encode_inputs_len,
                "pre_decoder_inputs": pre_decode_inputs, "pre_decoder_inputs_len": pre_decode_inputs_len, "pre_decoder_targets": pre_decode_targets,
                "post_decoder_inputs": post_decode_inputs, "post_decoder_inputs_len": post_decode_inputs_len, "post_decoder_targets": post_decode_targets
                }
        pre_decoder = {"pre_optimizer": pre_optimizer, "pre_loss": pre_loss, "pre_state": pre_final_state}
        post_decoder = {"post_optimizer": post_optimizer, "post_loss": post_loss, "post_state": post_final_state}

        return inputs, pre_decoder, post_decoder


vocab_size = 50000
epoch_num = 1000000
batch_size = 128
data_loader = DataLoader("doupocangqiong.txt", data_file_format="utf8", vocabulary_size=vocab_size, stop_word_file="stop_words.txt", use_jieba=True)
#data_loader = DataLoader("abc_news.txt", data_file_format="ascii", vocabulary_size=vocab_size)
data_loader.load()


def main():
    sen2vec = SkipThroughSen2vec(vocab_size=vocab_size, embedding_dim=64, num_units=64, batch_size=batch_size, learning_rate=1.0)
    inputs, pre_decoder, post_decoder = sen2vec.build()
    sess = tf.Session()
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())
    new_state = sess.run(inputs['initial_state'])
    step = 0
    while step < epoch_num:
        cur_inputs, cur_inputs_len, pre_inputs, pre_inputs_len, pre_targets, post_inputs, post_inputs_len, post_targets = data_loader.generate_skip_through_batch(batch_size)
        feed = {inputs["initial_state"]: new_state, 
Example #10
import os

from pca2d import PCA2D
from cnn_svm import CNN_SVM
# DataLoader and Dataset are assumed to come from this project's data_load
# module, as in the related examples in this list.
from data_load import DataLoader, Dataset
import numpy as np
import cv2

os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sn

from sklearn import metrics
sn.set(style='white', context='notebook', palette='deep')

cur_dataset = Dataset.TU
#cur_dataset = Dataset.YALE_EX_CROPPED           #Yale Cropped
dataloader = DataLoader(cur_dataset, {'angle_limit': 10, 'img_format': None})
X, y, z, v = dataloader.load_data(reload=False)
#X = X / 255

plt.figure(1)
g = sn.countplot(y)

#z is the number of classes
#v is the map of labels to which 0-num_classes correspond

num_val_splits = 10
labels = list(range(0, z))

#CNN + SVM 10-fold crossvalidation
X_shape = (X.shape[1], X.shape[2], X.shape[3])
#X_shape =  (X.shape[0], X.shape[1], X.shape[2])     # Yale Cropped
Example #11
    parser.add_argument('-ckpt', help='checkpoint to load model.')
    args = parser.parse_args()

    hp.set_hparam_yaml(args.case)

    # model
    audio_meta_train = VoxCelebMeta(hp.train.data_path, hp.train.meta_path)
    model = ClassificationModel(num_classes=audio_meta_train.num_speaker, **hp.model)

    # data loader
    audio_meta_class = globals()[hp.embed.audio_meta_class]
    params = {'data_path': hp.embed.data_path}
    if hp.embed.meta_path:
        params['meta_path'] = hp.embed.meta_path
    audio_meta = audio_meta_class(**params)
    data_loader = DataLoader(audio_meta, hp.embed.batch_size)

    # samples
    wav, mel_spec, speaker_id = next(data_loader.dataflow().get_data())

    ckpt = args.ckpt if args.ckpt else tf.train.latest_checkpoint(hp.logdir)

    pred_conf = PredictConfig(
        model=model,
        input_names=['x'],
        output_names=['embedding/embedding', 'prediction'],
        session_init=SaverRestore(ckpt) if ckpt else None)
    embedding_pred = OfflinePredictor(pred_conf)

    embedding, pred_speaker_id = embedding_pred(mel_spec)
Example #12
import logging
import tensorflow as tf

from tf_transformer import Transformer
from data_load import DataLoader
# import time

logging.basicConfig(level=logging.INFO)


logging.info("# hparams")
hparams = Hparams()
parser = hparams.parser
hp = parser.parse_args()
print(hp)
# save_hparams(hp, hp.logdir)

logging.info("# Prepare train/eval batches")
dataloader = DataLoader(hp.train1, hp.train2, hp.maxlen1, hp.maxlen2, hp.vocab)

xs = tf.placeholder(name='xs', dtype=tf.int32, shape=[16, 100])
ys1 = tf.placeholder(name='ys1', dtype=tf.int32, shape=[16, 99])
ys2 = tf.placeholder(name='ys2', dtype=tf.int32, shape=[16, 99])

logging.info("# Load model")
m = Transformer(hp)
loss = m.train(xs, (ys1, ys2))
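# Mask out <pad> positions so the loss is averaged over real tokens only.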
nonpadding = tf.to_float(tf.not_equal(ys2, dataloader.get_pad()))  # 0: <pad>
loss = tf.reduce_sum(loss * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)

global_step = tf.train.get_or_create_global_step()
optimizer = tf.train.GradientDescentOptimizer(hp.lr)
train_op = optimizer.minimize(loss, global_step=global_step)
# y_hat, eval_summaries = m.eval(xs, ys)
Example #13
    def save_model(self, path):
        self.saver.save(self.session, path)


#valid_words = [u'萧炎',u'灵魂',u'火焰',u'萧薰儿',u'药老',u'天阶',u"云岚宗",u"乌坦城",u"惊诧", u"少女"]
##valid_words = [u'斗破']
#valid_word_examples =[dictionary[li] for li in valid_words]
#valid_size = len(valid_word_examples)
#
#valid_sentence_examples = [4, 10, 11]
#valid_sentence_size = len(valid_sentence_examples)

#data_loader = DataLoader("doupocangqiong.txt", "utf8", vocabulary_size=50000, stop_word_file="stop_words.txt")
VOCA_SIZE = 10000
data_loader = DataLoader("abc_news.txt", "ascii", vocabulary_size=VOCA_SIZE)
data_loader.load()
sentence_size = len(data_loader.line_list)

#batch, labels = data_loader.generate_batch_pvdm(2, 2)
#print 'batch', batch
#print 'labels', labels

embedding_word_size = 64
embedding_sentence_size = 64
batch_size = 128
window_size = 2
num_sampled = 64

valid_sentence_examples = [1, 2]
valid_word_examples = [8, 15]
Example #14
import os

import numpy as np
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from pipeline import Pipeline
from models.cnn import CNN
from models.svm import SVM_C

from data_load import DataLoader, Dataset
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sn

os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

dataloader = DataLoader(Dataset.PKU_MMD, 'PKUMMD', settings={'full': True})
X, y, z, v = dataloader.load_data(reload=False)

conf_mat = np.zeros((z, z))

cnn = CNN([], z, X[0].shape, X[1].shape)
svm = SVM_C([])

pipeline = Pipeline(X, y, z, "PKUMMD_full", cnn, svm)
results = pipeline.train(10, 10, 100)

print("Final accuracy: " + str(results[1]))
sn.heatmap(results[0], annot=True, annot_kws={"size": 10},
           fmt='g')  # font size
plt.show()
Example #15
import logging

from hetu import gpu_ops as ad
from hetu import optimizer
from hetu import ndarray
import numpy as np
# import time

logging.basicConfig(level=logging.INFO)

logging.info("# hparams")
hparams = Hparams()
parser = hparams.parser
hp = parser.parse_args()
print(hp)

logging.info("# Prepare train/eval batches")
dataloader = DataLoader(hp.train1, hp.train2, hp.maxlen1, hp.maxlen2, hp.vocab)

ctx = ndarray.gpu(1)
xs = ad.Variable(name='xs')
ys1 = ad.Variable(name='ys1')
ys2 = ad.Variable(name='ys2')
nonpadding = ad.Variable(name='nonpadding')

logging.info("# Load model")
m = Transformer(hp)
loss = m.train(xs, (ys1, ys2))
loss = ad.div_op(ad.reduce_sum_op(loss * nonpadding, axes=[0, 1]),
                 ad.reduce_sum_op(nonpadding, axes=[0, 1]) + 1e-7)
opt = optimizer.SGDOptimizer(hp.lr)
train_op = opt.minimize(loss)
executor = ad.Executor([loss, train_op], ctx=ctx)
Example #16
    else:
        print('pre_notice')
        model = module_f5()
        if torch.cuda.is_available():
            print('using cuda')
            model = model.cuda()
        model.load_state_dict(torch.load(s_model))
        model.eval()
    return model


if __name__ == '__main__':
    pre_batch_size = 1500
    pre_path = r'D:\PROJECT\mature_to_imm\data\pre_data.csv'
    pre_data_l = Im_data(in_path=pre_path)
    pre_data = DataLoader(pre_data_l, batch_size=pre_batch_size, shuffle=False)
    sl = [('105358.pkl', '105358.csv'), ('105639.pkl', '105639.csv'),
          ('105854.pkl', '105854.csv'), ('110159.pkl', '110159.csv'),
          ('110440.pkl', '110440.csv')]
    for i in range(len(sl)):
        smodel = r'model/allgene/' + sl[i][0]
        s_model = select(i, smodel)
        pre_dict(s_model=s_model, pre_data=pre_data, save_name=sl[i][1])
    # input data
    # smodel = r'D:\PROJECT\md\model\allgene\105358.pkl'
    # # # smode2 = r'C:\Users\ZML15\Desktop\mature_to_imm\out_p\05-28_15_44P2.pkl'
    # pre_batch_size = 1500
    # pre_path = r'D:\PROJECT\mature_to_imm\data\pre_data.csv'
    # pre_data_l = Im_data(in_path=pre_path)
    # pre_data = DataLoader(pre_data_l, batch_size=pre_batch_size, shuffle=False)
    # pre_dict(s_model=smodel,pre_data=pre_data,save_name='6354.csv')
Example #17
import os
import numpy as np

os.environ["CUDA_VISIBLE_DEVICES"] = "-1"


def savecounter(filename, arr1, arr2, arr3):
    with open(filename, 'wb') as fl:
        np.savez(fl, arr1, arr2, arr3)


import seaborn as sn
import matplotlib.pyplot as plt

#cur_dataset = Dataset.TU
#dataloader = DataLoader(cur_dataset, {'angle_limit':10, 'img_format':None})

cur_dataset = Dataset.YALE_EX_CROPPED
dataloader = DataLoader(cur_dataset, {'resize': True})
X, y, z, v = dataloader.load_data(reload=False)

#X = X/255.0
#z is the number of classes
#v is the map of labels to which 0-num_classes correspond

num_val_splits = 10
#Find optimal parameters CNN

X_shape = (X.shape[1], X.shape[2])

# #CNN params
# epochs_arr = [2]
# batch_count_arr = [10,20,30]
# conv_layers_count_arr = [1,2,3]
Example #18
import unittest

import tensorflow as tf

# DataLoader is assumed to come from this project's data_load module.
from data_load import DataLoader


class TestLoad(unittest.TestCase):
    def setUp(self):  # pylint: disable=g-missing-super-call
        self.loader = DataLoader("./data/train",
                                 "./data/valid",
                                 "./data/test",
                                 seq_length=512)

    def test_get_data(self):
        self.assertIsInstance(self.loader.train_data, list)
        self.assertIsInstance(self.loader.train_label, list)
        self.assertIsInstance(self.loader.valid_data, list)
        self.assertIsInstance(self.loader.valid_label, list)
        self.assertIsInstance(self.loader.test_data, list)
        self.assertIsInstance(self.loader.test_label, list)
        self.assertEqual(self.loader.train_len, len(self.loader.train_data))
        self.assertEqual(self.loader.train_len, len(self.loader.train_label))
        self.assertEqual(self.loader.valid_len, len(self.loader.valid_data))
        self.assertEqual(self.loader.valid_len, len(self.loader.valid_label))
        self.assertEqual(self.loader.test_len, len(self.loader.test_data))
        self.assertEqual(self.loader.test_len, len(self.loader.test_label))

    def test_pad(self):
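        # Based on the expected values: pad() returns two padded variants of
        # the input, one padded at the front toward the first frame and one
        # padded at the end toward the last frame (checked here within a loose
        # tolerance), while sequences longer than seq_length are truncated.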
        original_data1 = [[2, 3], [1, 1]]
        expected_data1_0 = [[2, 3], [2, 3], [2, 3], [2, 3], [1, 1]]
        expected_data1_1 = [[2, 3], [1, 1], [1, 1], [1, 1], [1, 1]]
        original_data2 = [[-2, 3], [-77, -681], [5, 6], [9, -7], [22, 3333],
                          [9, 99], [-100, 0]]
        expected_data2 = [[-2, 3], [-77, -681], [5, 6], [9, -7], [22, 3333]]
        padding_data1 = self.loader.pad(original_data1, seq_length=5, dim=2)
        padding_data2 = self.loader.pad(original_data2, seq_length=5, dim=2)
        for i in range(len(padding_data1[0])):
            for j in range(len(padding_data1[0].tolist()[0])):
                self.assertLess(
                    abs(padding_data1[0].tolist()[i][j] -
                        expected_data1_0[i][j]), 10.001)
        for i in range(len(padding_data1[1])):
            for j in range(len(padding_data1[1].tolist()[0])):
                self.assertLess(
                    abs(padding_data1[1].tolist()[i][j] -
                        expected_data1_1[i][j]), 10.001)
        self.assertEqual(padding_data2[0].tolist(), expected_data2)
        self.assertEqual(padding_data2[1].tolist(), expected_data2)

    def test_format(self):
        self.loader.format()
        expected_train_label = int(
            self.loader.label2id[self.loader.train_label[0]])
        expected_valid_label = int(
            self.loader.label2id[self.loader.valid_label[0]])
        expected_test_label = int(
            self.loader.label2id[self.loader.test_label[0]])
        for feature, label in self.loader.train_data:  # pylint: disable=unused-variable
            format_train_label = label.numpy()
            break
        for feature, label in self.loader.valid_data:
            format_valid_label = label.numpy()
            break
        for feature, label in self.loader.test_data:
            format_test_label = label.numpy()
            break
        self.assertEqual(expected_train_label, format_train_label)
        self.assertEqual(expected_valid_label, format_valid_label)
        self.assertEqual(expected_test_label, format_test_label)
        self.assertIsInstance(self.loader.train_data, tf.data.Dataset)
        self.assertIsInstance(self.loader.valid_data, tf.data.Dataset)
        self.assertIsInstance(self.loader.test_data, tf.data.Dataset)
Example #19
#!/usr/bin/env python
import argparse
from tensorpack.dataflow.remote import send_dataflow_zmq
from data_load import DataLoader, AudioMeta
from hparam import hparam as hp
import multiprocessing

if __name__ == '__main__':
    # get arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('case', type=str, help='experiment case name.')
    parser.add_argument('-data_path', type=str)
    parser.add_argument('-dest_url', type=str)
    parser.add_argument('-num_thread', type=int, default=1)
    args = parser.parse_args()

    # set hyper-parameters from yaml file
    hp.set_hparam_yaml(case=args.case)

    if args.data_path:
        hp.train.data_path = args.data_path

    # dataflow
    audio_meta = AudioMeta(hp.train.data_path)
    data_loader = DataLoader(audio_meta, 1)
    num_thread = args.num_thread if args.num_thread else int(multiprocessing.cpu_count() // 1.5)
    data_loader = data_loader.dataflow(nr_prefetch=5000,
                                       nr_thread=num_thread)

    send_dataflow_zmq(data_loader, args.dest_url)