Example #1
    assert args['model'] in model_cfg.cfg['models'], \
        "model %s not found in config/models.yaml" % args['model']

    image_processing = model_cfg.cfg['models'][
        args['model']]['image_processing']
    image_processing['ignore_aspect_ratio'] = args['ignore_aspect_ratio']

    input_shape = (image_processing['output_height'],
                   image_processing['output_width'], 3)

    # Add 'label/' prefix to labels as they are stored in the .tfrecord files
    output_labels = args['labels']
    output_labels_clean = ['label/' + x for x in output_labels]

    # Class to numeric mappings and number of classes per label
    class_mapping = read_json(args['class_mapping_json'])
    # TODO: fix num classes per label for a:0, b:0 cases
    n_classes_per_label_dict = {
        c: len(set(class_mapping[o].values()))
        for o, c in zip(output_labels, output_labels_clean)
    }
    n_classes_per_label = [
        n_classes_per_label_dict[x] for x in output_labels_clean
    ]
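    # Illustrative sketch (hypothetical data, not from the source): with
    # class_mapping == {'species': {'cat': 0, 'dog': 1, 'puppy': 1}}, the code
    # above yields n_classes_per_label_dict == {'label/species': 2} and
    # n_classes_per_label == [2] -- len(set(...)) counts distinct numeric ids,
    # so 'dog' and 'puppy' sharing id 1 count only once.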

    # save class mapping file to current run path
    export_dict_to_json(
        class_mapping,
        os.path.join(args['run_outputs_dir'], 'label_mappings.json'))

    # Find TFR files
Example #2
from training.prepare_model import create_model

tfr_encoder_decoder = DefaultTFRecordEncoderDecoder()

data_reader = DatasetReader(tfr_encoder_decoder.decode_record)


tfr_train = find_tfr_files_pattern('./test_big/cats_vs_dogs/tfr_files/',
                                   'train')

output_labels = ['class']
output_labels_clean = ['label/class']

class_mapping_file = './test_big/cats_vs_dogs/tfr_files/label_mapping.json'

class_mapping = read_json(class_mapping_file)
n_classes_per_label_dict = {c: len(class_mapping[o]) for o, c in
                            zip(output_labels, output_labels_clean)}
n_classes_per_label = [n_classes_per_label_dict[x]
                       for x in output_labels_clean]
index_to_class = {
    k: {vv: kk for kk, vv in v.items()}
    for k, v in class_mapping.items()
}
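# Illustrative sketch (hypothetical data): inverting
# {'class': {'cat': 0, 'dog': 1}} gives
# index_to_class == {'class': {0: 'cat', 1: 'dog'}}, i.e. a per-label lookup
# from numeric id back to class name for decoding predictions. Duplicate ids
# would collide during inversion, keeping only the last class name.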


# Load model config
model_cfg = ConfigLoader('./config/models.yaml')

image_processing = model_cfg.cfg['models']['small_cnn']['image_processing']

# Calculate Dataset Image Means and Stdevs for a dummy batch
dataset = data_reader.get_iterator(
    tfr_files=tfr_train,
    # ... remaining keyword arguments are truncated in the source listing
)
Example #3
def create_input_files(dataset,
                       karpathy_json_path,
                       image_folder,
                       min_word_freq,
                       output_folder,
                       max_len=100,
                       vocab_size=None,
                       is_write_img=True):
    """
    Creates input files for training, validation, and test data.

    :param dataset: name of dataset, one of 'coco', 'flickr8k', 'flickr30k'
    :param karpathy_json_path: path of Karpathy JSON file with splits and captions
    :param image_folder: folder with downloaded images
    :param min_word_freq: words occurring less frequently than this threshold are binned as <unk>s
    :param output_folder: folder to save files
    :param max_len: don't sample captions longer than this length
    :param vocab_size: optional cap on the vocabulary size passed to the word-map builder
    :param is_write_img: if False, skip writing the image HDF5 files
    """
    assert dataset in {'coco', 'flickr8k', 'flickr30k'}
    captions_per_image = 1  # fixed: this pipeline samples exactly one caption per image
    base_filename = f'{dataset}_{str(captions_per_image)}_cap_per_img_{str(min_word_freq)}_min_word_freq'

    data = read_json(karpathy_json_path)

    # create and save word_freq
    word_map, all_captions = create_word_map_from_pretrained_wordpiece(
        data, base_filename, output_folder, min_word_freq, max_len, vocab_size)

    # streamline data and write id, styles data
    partition_dict = create_nonimage_input(data, word_map, all_captions,
                                           base_filename, image_folder,
                                           output_folder)
    train_image_paths, train_image_captions = partition_dict['train']
    val_image_paths, val_image_captions = partition_dict['val']
    test_image_paths, test_image_captions = partition_dict['test']

    # write encoded captions and cap lengths as json
    for impaths, imcaps, split in [
        (train_image_paths, train_image_captions, 'TRAIN'),
        (val_image_paths, val_image_captions, 'VAL'),
        (test_image_paths, test_image_captions, 'TEST')
    ]:
        enc_captions, caplens = [], []
        for i, path in enumerate(tqdm(impaths)):
            # sample captions
            if len(imcaps[i]) < captions_per_image:
                captions = imcaps[i] + [
                    choice(imcaps[i])
                    for _ in range(captions_per_image - len(imcaps[i]))
                ]
            else:
                captions = sample(imcaps[i], k=captions_per_image)

            # sanity check
            assert len(captions) == captions_per_image

            c = captions[0]
            # Encode captions
            enc_c = [word_map['<start>']] + [
                word_map.get(word, word_map['<unk>']) for word in c
            ] + [word_map['<end>']] + [word_map['<pad>']] * (max_len - len(c))

            # Find caption lengths
            c_len = len(c) + 2
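            # Illustrative sketch (hypothetical tokens): with max_len == 5 and
            # c == ['a', 'cat'], enc_c is
            # [<start>, a, cat, <end>, <pad>, <pad>, <pad>] and c_len == 4,
            # i.e. the two real tokens plus <start> and <end>.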

            enc_captions.append(enc_c)
            caplens.append(c_len)

            assert len(enc_captions) == len(caplens)

        # write json
        captions_json = os.path.join(
            output_folder, split + '_CAPTIONS_' + base_filename + '.json')
        write_json(enc_captions, captions_json)
        caplens_json = os.path.join(
            output_folder, split + '_CAPLENS_' + base_filename + '.json')
        write_json(caplens, caplens_json)

    if not is_write_img:
        return None

    # sample captions for each image, save images to HDF5 file, and captions and their lengths to JSON files
    seed(123)
    for impaths, imcaps, split in [
        (train_image_paths, train_image_captions, 'TRAIN'),
        (val_image_paths, val_image_captions, 'VAL'),
        (test_image_paths, test_image_captions, 'TEST')
    ]:

        with h5py.File(
                os.path.join(output_folder,
                             split + '_IMAGES_' + base_filename + '.hdf5'),
                'a') as h:
            # Make a note of the number of captions we are sampling per image
            h.attrs['captions_per_image'] = captions_per_image

            # Create dataset inside HDF5 file to store images
            images = h.create_dataset('images', (len(impaths), 3, 256, 256),
                                      dtype='uint8')

            print("\nReading %s images and captions, storing to file...\n" %
                  split)

            for i, path in enumerate(tqdm(impaths)):

                # Read image; stack grayscale to 3 channels before converting to RGB
                img = cv2.imread(path)
                if len(img.shape) == 2:
                    img = img[:, :, np.newaxis]
                    img = np.concatenate([img, img, img], axis=2)
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                img = cv2.resize(img, (256, 256))
                img = img.transpose(2, 0, 1)
                assert img.shape == (3, 256, 256)
                assert np.max(img) <= 255

                # Save image to HDF5 file
                images[i] = img
            # Sanity check
            #assert images.shape[0] * captions_per_image == len(enc_captions) == len(caplens)
    return None
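A minimal usage sketch, assuming hypothetical paths (the split JSON, image folder, and output folder below are placeholders, not taken from the source):

create_input_files(dataset='flickr8k',
                   karpathy_json_path='./data/dataset_flickr8k.json',
                   image_folder='./data/flickr8k_images/',
                   min_word_freq=1,
                   output_folder='./data/meta_wstyle/',
                   max_len=100,
                   vocab_size=8000,
                   is_write_img=True)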
Example #4
    logger = logging.getLogger(__name__)

    args = vars(parser.parse_args())

    print("Using arguments:")
    for k, v in args.items():
        print("Arg: %s: %s" % (k, v))

    # Load Model and extract input/output layers
    keras_model = load_model_from_disk(args['model'])

    input_names = keras_model.input_names
    output_names = keras_model.output_names

    label_mapping = read_json(args['class_mapping_json'])
    pre_processing = read_json(args['pre_processing_json'])
    estimator = model_to_estimator(
        keras_model,
        model_dir=args['estimator_save_dir'])

    def decode_and_process_image(image):
        image = tf.image.decode_jpeg(image, channels=3)
        image = preprocess_image(image, **pre_processing)
        return image

    def generate_dataset_iterator(image_list):
        """ Dataset Iterator from a list of Image Bytes """
        dataset = tf.data.Dataset.from_tensor_slices(image_list)
        dataset = dataset.map(decode_and_process_image)
        dataset = dataset.batch(128)
        return dataset  # assumed return value; the original snippet is truncated here
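A minimal sketch of feeding the iterator to the estimator, assuming image_bytes_list is a hypothetical list of raw JPEG byte strings (not defined in the source snippet):

    predictions = estimator.predict(
        input_fn=lambda: generate_dataset_iterator(image_bytes_list))
    for pred in predictions:
        print(pred)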
Example #5
import os

import torch
from transformers import AutoTokenizer

from data.utils import read_json
from src.models import get_encoder_decoder
from src.datasets import CaptionDataset
from src.utils import *
from src.word_map_utils import get_wp_tokenizer

checkpoint_dir = './ckpts/v24_bigmodel_mid_heavydropout_1024embed'
data_folder = './data/meta_wstyle/data_mid_clean_wonumber_wemojis_wp'
data_name = 'flickr8k_1_cap_per_img_1_min_word_freq'
checkpoint_file = os.path.join(
    checkpoint_dir, 'checkpoint_flickr8k_1_cap_per_img_1_min_word_freq.pth')
word_map_file = f'{data_folder}/WORDMAP_{data_name}.json'

word_map = read_json(word_map_file)
rev_word_map = {v: k for k, v in word_map.items()}
emoji_set = [
    w for w in word_map.keys() if w.startswith(':') and w.endswith(':')
]
vocab_size = len(word_map)

cfg_path = os.path.join(checkpoint_dir, 'config.json')
cfg = read_json(cfg_path)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
encoder, decoder = get_encoder_decoder(cfg)
checkpoint = torch.load(checkpoint_file, map_location=device)
encoder.load_state_dict(checkpoint['encoder'])
decoder.load_state_dict(checkpoint['decoder'])
encoder.to(device)
decoder.to(device)  # move both halves of the model; the original snippet truncates here
Example #6
import numpy as np
from pymagnitude import Magnitude
from tqdm import tqdm

from data.utils import read_json

MAGNITUDE = '/home/alex/.magnitude/glove.twitter.27B.200d.magnitude'


def build_embedding_weight(word_map, embed_dim=200):
    vocab_size = len(word_map)
    weight = np.zeros((vocab_size, embed_dim))

    vectors = Magnitude(MAGNITUDE)
    for word, idx in tqdm(word_map.items()):
        word_embed = vectors.query(word)
        weight[idx, :] = word_embed
    return weight


if __name__ == '__main__':
    word_map_file = './data/meta_wstyle/data_mid_clean_wonumber/WORDMAP_flickr8k_1_cap_per_img_1_min_word_freq.json'
    npy_file = './pretrained/embedding.npy'

    word_map = read_json(word_map_file)
    weight = build_embedding_weight(word_map)
    with open(npy_file, 'wb') as f:
        np.save(f, weight)
    print(f'pretrained word embedding written: {npy_file} ({weight.shape})')

    # rev_word_map = {v: k for k, v in word_map.items()}
    # test_word_map = dict()
    # for i in range(50):
    #     word = rev_word_map[i]
    #     idx = word_map[word]
    #     test_word_map[word] = idx
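A minimal sketch of consuming the saved weights on the model side, assuming the decoder uses a standard torch.nn.Embedding (names below are illustrative, not from the source):

import numpy as np
import torch

weight = np.load('./pretrained/embedding.npy')
embedding = torch.nn.Embedding.from_pretrained(
    torch.from_numpy(weight).float(),
    freeze=False)  # keep the pretrained vectors trainable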
Example #7
import os

from data.utils import read_json
from src.word_map_utils import create_word_map_from_simple
from src.word_map_utils import create_word_map_from_pretrained_wordpiece

ig_json = './data/ig_json/full_clean_wonumber.json'
output_folder = './tests'
data = read_json(ig_json)


def test_create_word_map_from_pretrained_wordpiece():
    word_map, all_captions = create_word_map_from_pretrained_wordpiece(
        data,
        'wordpiece',
        output_folder=output_folder,
        min_word_freq=5,
        max_len=50,
        vocab_size=8000)
    return word_map, all_captions


def test_create_word_map_from_simple():
    word_map, all_captions = create_word_map_from_simple(
        data,
        'simple',
        output_folder=output_folder,
        min_word_freq=5,
        max_len=50,
        vocab_size=None)
    return word_map, all_captions
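A quick usage sketch (illustrative only) comparing the two builders' vocabularies:

if __name__ == '__main__':
    wp_map, _ = test_create_word_map_from_pretrained_wordpiece()
    simple_map, _ = test_create_word_map_from_simple()
    print(f'wordpiece vocab: {len(wp_map)}, simple vocab: {len(simple_map)}')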