Python Dataset примеры, src.data_utils.Dataset Python примеры использования

Пример #1

0

Показать файл

Файл: run_train_pico.py Проект: jind11/Deep-PICO-Detection

def main():
    """Train the HANN model on the PICO data and evaluate it on the test split.

    The function builds the model from the configuration, wraps the dev,
    train and test files in ``Dataset`` objects (plus the augmentation file
    when ``config.num_augmentation`` is truthy), trains the model, then
    restores the session stored under ``config.dir_model`` and evaluates it
    on the test set.
    """
    # NOTE(review): ``parser`` is not defined in this function; presumably it
    # is a module-level argument parser -- confirm against the full file.
    config = Config(parser)
    assert config.data_keyname == 'pico'

    # Model construction
    model = HANNModel(config)
    model.build()

    # Labelled splits: each one gets both the word and the tag processor.
    dev_set = Dataset(
        config.filename_dev,
        config.processing_word,
        config.processing_tag,
    )
    train_set = Dataset(
        config.filename_train,
        config.processing_word,
        config.processing_tag,
    )
    test_set = Dataset(
        config.filename_test,
        config.processing_word,
        config.processing_tag,
    )

    # The augmentation set is built without a tag processor and is capped at
    # ``num_augmentation`` via ``max_iter``; it is skipped when that is falsy.
    aug_set = (
        Dataset(
            config.filename_aug,
            config.processing_word,
            max_iter=config.num_augmentation,
        )
        if config.num_augmentation
        else None
    )

    # Training
    model.train(train_set, dev_set, aug_set)

    # Evaluation on the restored weights
    model.restore_session(config.dir_model)
    model.evaluate(test_set)

Пример #2

0

Показать файл

Файл: build_data.py Проект: jind11/Deep-PICO-Detection

def main():
    """Build the vocabulary files and the trimmed word-vector file.

    This procedure iterates over the dev, test, train and augmentation
    datasets to collect word frequencies and the tag vocabulary. Words whose
    frequency is not strictly greater than ``config.min_freq`` are dropped,
    the ``'None'`` tag is removed, and the special tokens ``UNK``,
    ``WORD_PAD`` and ``NUM`` are added to the word vocabulary. Both
    vocabularies are written to the files named in the configuration; the
    word vocabulary is then reloaded and passed to
    ``export_trimmed_wordvec_vectors`` together with the source and target
    word-vector file names.

    The function takes no arguments; it creates its own ``Config`` with
    ``load=False``.
    """
    config = Config(load=False)
    processing_word = get_processing_word(lowercase=True)

    # Dataset generators (the augmentation data is included in the vocab).
    dev_set = Dataset(config.filename_dev, processing_word)
    test_set = Dataset(config.filename_test, processing_word)
    train_set = Dataset(config.filename_train, processing_word)
    aug_set = Dataset(config.filename_aug, processing_word)

    # Collect word frequencies and tags over every dataset.
    word_freqs, tag_vocab = get_vocabs([train_set, dev_set, test_set, aug_set])

    # Keep only the words that occur more than ``min_freq`` times.
    kept_words = {
        word: count
        for word, count in word_freqs.items()
        if count > config.min_freq
    }
    tag_vocab.remove('None')

    # Special tokens are registered with a nominal frequency of 1.
    for special_token in (UNK, WORD_PAD, NUM):
        kept_words[special_token] = 1

    # Persist both vocabularies.
    write_vocab(kept_words, config.filename_words)
    write_vocab(tag_vocab, config.filename_tags)

    # Trim the word vectors down to the saved word vocabulary.
    vocab, _ = load_vocab(config.filename_words)
    export_trimmed_wordvec_vectors(
        vocab,
        config.filename_wordvec,
        config.filename_wordvec_trimmed,
    )

Пример #3

0

Показать файл

Файл: data_pre.py Проект: Anfankus/3dunet-tf

# Raise the threshold of the ``log`` logger (defined elsewhere in the file)
# to WARNING.
log.setLevel(logging.WARNING)

# Get file paths to DICOM MRI scans and segmentation images.
# Note that leaderboard samples can be used for training.
# All raw data is globbed from the same '../data/raw' root so that scans and
# their segmentations come from the same directory tree.
train_scan_files = glob('../data/raw/train/**/*.dcm', recursive=True)
train_scan_files += glob('../data/raw/leaderboard/**/*.dcm', recursive=True)
test_scan_files = glob('../data/raw/test/**/*.dcm', recursive=True)

# ProstateDx-01-0006_corrected_label.nrrd was renamed to ProstateDx-01-0006.nrrd
# In the leaderboard and test folders the _truth postfix has been removed from
# all nrrd files.
train_seg_files = glob('../data/raw/train/**/*.nrrd', recursive=True)
train_seg_files += glob('../data/raw/leaderboard/**/*.nrrd', recursive=True)
test_seg_files = glob('../data/raw/test/**/*.nrrd', recursive=True)

# Build datasets from file paths.
train_dataset = Dataset(scan_files=train_scan_files, seg_files=train_seg_files)
test_dataset = Dataset(scan_files=test_scan_files, seg_files=test_seg_files)

# Summary statistics: patient counts and per-patient scan counts (the first
# dimension of each patient's ``scans`` array).
train_n = len(train_dataset.patient_ids)
test_n = len(test_dataset.patient_ids)
train_scan_nums = [p.scans.shape[0] for p in train_dataset.patients.values()]
test_scan_nums = [p.scans.shape[0] for p in test_dataset.patients.values()]

print('Number of patients in train dataset: %d' % train_n)
print('Number of patients in test dataset: %d' % test_n)
print('Number of scans in train dataset: %d' % sum(train_scan_nums))
print('Number of scans in test dataset: %d' % sum(test_scan_nums))

# extract manufacturer and thickness sets from each patient
train_manufacturers = [
    p.manufacturers for p in train_dataset.patients.values()

Пример #4

0

Показать файл

def main():
    """Run 10-fold cross-validation of the HANN model on the PICO data.

    For each fold (1 to 10) a fresh model is built and trained on that fold's
    train/dev files, the saved session is restored, and the model is
    evaluated on the fold's test file. Per-fold precision, recall and F1 for
    the tags P, I and O are printed and appended to the results file; after
    the last fold the per-tag averages (and the overall F1 average) are
    printed and appended as well.
    """
    # Configuration overrides for this experiment.
    config = Config()
    assert config.data_keyname == 'pico'
    config.num_augmentation = 0
    config.batch_size = 20
    config.batch_size_aug = 20
    config.dir_output = 'test-num_augmentation-{}'.format(
        config.num_augmentation)
    config.dir_model = os.path.join(config.dir_output, "model.weights")
    config.data_root = '../data/{}/10_folds'.format(config.data_keyname)

    result_file_path = os.path.join(config.dir_output,
                                    'cross_validate_results')

    tags = ('P', 'I', 'O')
    precisions = {tag: [] for tag in tags}
    recalls = {tag: [] for tag in tags}
    f1s = {tag: [] for tag in tags}

    def report(text, trailer=''):
        # Echo the text and append it (plus an optional trailer) to the
        # results file.
        print(text)
        with open(result_file_path, 'a') as ofile:
            ofile.write(text)
            if trailer:
                ofile.write(trailer)

    for fold in range(1, 11):
        print('Fold {}'.format(fold))

        # A new model is built for every fold.
        model = HANNModel(config)
        model.build()

        # Per-fold datasets.
        fold_dir = os.path.join(config.data_root, str(fold))
        train = Dataset(os.path.join(fold_dir, 'train.txt'),
                        config.processing_word, config.processing_tag)
        dev = Dataset(os.path.join(fold_dir, 'dev.txt'),
                      config.processing_word, config.processing_tag)
        test = Dataset(os.path.join(fold_dir, 'test.txt'),
                       config.processing_word, config.processing_tag)
        data_aug = (
            Dataset(config.filename_aug,
                    config.processing_word,
                    max_iter=config.num_augmentation)
            if config.num_augmentation
            else None
        )

        # Train, then evaluate the restored weights on the test fold.
        model.train(train, dev, data_aug)
        model.restore_session(config.dir_model)
        metrics = model.evaluate(test)

        # Accumulate the per-tag scores of this fold.
        for store, key in ((precisions, 'precision'),
                           (recalls, 'recall'),
                           (f1s, 'f1')):
            for tag in tags:
                store[tag].append(metrics[key][tag])

        report('fold: {}\tprecision: {}\trecall: {}\tf1: {}\n'.format(
            fold, metrics['precision'], metrics['recall'], metrics['f1']))

    # Averages over all folds.
    report('Average Precision: P: {}\tI: {}\tO: {}\n'.format(
        *[np.mean(precisions[tag]) for tag in tags]))
    report('Average Recall: P: {}\tI: {}\tO: {}\n'.format(
        *[np.mean(recalls[tag]) for tag in tags]))
    res = np.mean([np.mean(values) for values in f1s.values()])
    report(
        'Average F1: P: {}\tI: {}\tO: {}\tall: {}\n'.format(
            *[np.mean(f1s[tag]) for tag in tags], res),
        trailer='\n\n\n',
    )

Пример #5

0

Показать файл

Файл: input_fn.py Проект: Anfankus/3dunet-tf

def input_fn(training, params):
    """
    Input function for the 3D U-Net estimator, covering both the training and
    the testing data pipelines.

    Args:
        training (bool): Whether we are training or testing.
        params (dict): Params for setting up the data. Expected keys are:
            max_scans (int): Maximum number of scans we see in any patient.
            train_img_size (int): Width and height of resized training images.
            batch_size (int): Number of patients in each batch for training.
            num_classes (int): Number of mutually exclusive output classes.
            train_dataset_path (str): Path to pickled
                :class:`src.data_utils.Dataset` object.
            test_dataset_path (str): Path to pickled
                :class:`src.data_utils.Dataset` object.

    Returns:
        tuple: ``(features, labels)`` where ``features`` is a dict holding
        the first iterator output under ``'x'`` and the second under ``'y'``,
        and ``labels`` is that same second output.
    """
    # Dataset paths in ``params`` are resolved relative to the directory two
    # levels above this file.
    this_file = os.path.abspath(__file__)
    package_root = os.path.dirname(os.path.dirname(this_file))

    depth = params['max_scans']
    side = params['train_img_size']

    if training:
        # For training we batch several patients and pad each 3D scan to an
        # equal depth; width and height were already fixed in preprocessing.
        source = Dataset.load_dataset(
            os.path.join(package_root, params['train_dataset_path'])
        )
        pipeline = source.create_tf_dataset()
        # We have 70 train examples, so this buffer provides good shuffling.
        pipeline = pipeline.shuffle(buffer_size=70)
        pipeline = pipeline.repeat()
        dataset = pipeline.padded_batch(
            batch_size=params['batch_size'],
            padded_shapes=(
                [depth, side, side, 1],
                [depth, side, side, params['num_classes']],
            ),
        )
    else:
        # For testing we use the images with their original width/height and
        # only pad the depth dimension. The path name tells us whether the
        # dataset being predicted is a resized one.
        resized = 'resized' in params['test_dataset_path']
        source = Dataset.load_dataset(
            os.path.join(package_root, params['test_dataset_path'])
        )
        pipeline = source.create_tf_dataset(resized=resized)
        # Test scans differ in size, so the batch size has to be 1.
        dataset = pipeline.padded_batch(
            batch_size=1,
            padded_shapes=(
                [depth, None, None, 1],
                [depth, None, None, params['num_classes']],
            ),
        )

    # Build an iterator for the dataset structure and register its
    # initializer in the TABLE_INITIALIZERS collection.
    iterator = tf.data.Iterator.from_structure(
        dataset.output_types,
        dataset.output_shapes
    )
    dataset_init_op = iterator.make_initializer(dataset)
    tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS, dataset_init_op)
    next_element = iterator.get_next()

    # Extremely hacky way of getting tf.estimator to return labels at
    # prediction time: the labels are also passed inside the features dict.
    # See https://github.com/tensorflow/tensorflow/issues/17824
    labels = next_element[1]
    features = {'x': next_element[0], 'y': labels}
    return features, labels

Пример #6

0

Показать файл

Файл: run_train_cross_validate_nicta.py Проект: jind11/Deep-PICO-Detection

def main():
    """Run 10-fold cross-validation of the HANN model on the NICTA data.

    For each fold (1 to 10) a fresh model is built and trained, the saved
    session is restored, and the model is evaluated on the fold's test file.
    Per-fold precision, recall and F1 for every tag in ``tag_ls`` are printed
    and appended to the results file. After the last fold the per-tag
    averages, plus the mean of the per-tag F1 averages, are printed and
    appended, one newline-terminated line per metric.
    """
    # Configuration overrides for this experiment.
    config = Config()
    assert config.data_keyname == 'nicta'
    config.num_augmentation = 200000
    config.batch_size = 20
    config.batch_size_aug = 20
    config.attention_size = 50
    config.hidden_size_lstm_document = 200
    config.dropout = 0.8
    config.cnn_filter_num = 150
    config.adv_perturb_norm_length = 4
    config.va_perturb_norm_length = 4
    config.adv_reg_coeff = 0.3
    config.va_reg_coeff = 0.3
    config.data_root = '../data/nicta_piboso/10_folds'
    config.dir_output = 'results/nicta/test-num_augmentation-{}-va_coeff-{}-adv-coeff-{}'.format(
        config.num_augmentation, config.va_reg_coeff, config.adv_reg_coeff)
    config.dir_model = os.path.join(config.dir_output, "model.weights")

    result_file_path = os.path.join(config.dir_output, 'cross_validate_results')

    precisions = defaultdict(list)
    recalls = defaultdict(list)
    f1s = defaultdict(list)
    tag_ls = ['P', 'I', 'O', 'S', 'B', 'OT']

    for fold in range(1, 11):
        print('Fold {}'.format(fold))

        # A new model is built for every fold.
        model = HANNModel(config)
        model.build()

        # create datasets
        train = Dataset(os.path.join(config.data_root, str(fold), 'train.txt'), config.processing_word,
                        config.processing_tag)
        # NOTE(review): the dev split reads the same 'test.txt' file as the
        # test split below. If the dev set drives model selection or early
        # stopping, the reported test scores are optimistic -- confirm
        # whether a separate dev file exists for these folds.
        dev = Dataset(os.path.join(config.data_root, str(fold), 'test.txt'), config.processing_word,
                      config.processing_tag)
        test = Dataset(os.path.join(config.data_root, str(fold), 'test.txt'), config.processing_word,
                       config.processing_tag)
        if config.num_augmentation:
            data_aug = Dataset(config.filename_aug, config.processing_word, max_iter=config.num_augmentation)
        else:
            data_aug = None

        # train model
        model.train(train, dev, data_aug)

        # evaluate model
        model.restore_session(config.dir_model)
        metrics = model.evaluate(test)

        # Accumulate the per-tag scores of this fold (plain loops instead of
        # list comprehensions used only for their side effects).
        for tag in tag_ls:
            precisions[tag].append(metrics['precision_all'][tag])
        for tag in tag_ls:
            recalls[tag].append(metrics['recall_all'][tag])
        for tag in tag_ls:
            f1s[tag].append(metrics['f1_all'][tag])

        msg = 'fold: {}\tprecision: {}\trecall: {}\tf1: {}\n'.format(fold, metrics['precision_all'],
                                                                     metrics['recall_all'], metrics['f1_all'])
        print(msg)
        with open(result_file_path, 'a') as ofile:
            ofile.write(msg)

    # Each summary line ends with a newline so that consecutive summaries do
    # not run together on one line of the results file.
    msg = 'Average Precision: {}\n'.format(
        '\t'.join(['{}: {}'.format(tag, np.mean(precisions[tag])) for tag in tag_ls]))
    print(msg)
    with open(result_file_path, 'a') as ofile:
        ofile.write(msg)

    msg = 'Average Recall: {}\n'.format(
        '\t'.join(['{}: {}'.format(tag, np.mean(recalls[tag])) for tag in tag_ls]))
    print(msg)
    with open(result_file_path, 'a') as ofile:
        ofile.write(msg)

    # Mean of the per-tag F1 averages, reported as "all".
    res = np.mean([np.mean(values) for values in f1s.values()])
    msg = 'Average F1: {}\tall: {}\n'.format(
        '\t'.join(['{}: {}'.format(tag, np.mean(f1s[tag])) for tag in tag_ls]), res)
    print(msg)
    with open(result_file_path, 'a') as ofile:
        ofile.write(msg)
        ofile.write('\n\n\n')

Python Dataset примеры использования