import os
import sys
import random

import numpy as np
import tensorflow as tf
from PIL import Image
from scipy.misc import imresize  # deprecated since SciPy 1.0; kept to match the TF 1.x-era code
import matplotlib.pyplot as plt

import tfrecord_utils

# unpickle, load_svhn_mat, load_text and commandLineParser are project-local
# helpers assumed to be defined alongside these functions.


def process_data_greyscale(args):
    files = ['data_batch_1', 'data_batch_2', 'data_batch_3', 'data_batch_4', 'data_batch_5', 'test_batch']

    for file in files:
        labels, images = unpickle(os.path.join(args.data_path, file))
        num_examples = images.shape[0]
        # Luma-style greyscale conversion from the RGB channels
        images = 0.21 * images[:, :, :, 0] + 0.72 * images[:, :, :, 1] + 0.07 * images[:, :, :, 2]
        images = np.asarray(images, dtype=np.int32)
        print(num_examples)
        # Close the writer from the previous batch file, if one exists
        try:
            writer.close()
        except NameError:
            pass
        writer = tf.python_io.TFRecordWriter(os.path.join(args.target_path, file + '.tfrecord'))

        for index in range(num_examples):
            label = int(labels[index])

            image_raw = imresize(images[index], [args.size, args.size]).tostring()
            example = tf.train.Example(features=tf.train.Features(feature={
                'height': tfrecord_utils.int64_feature([args.size]),
                'width': tfrecord_utils.int64_feature([args.size]),
                'depth': tfrecord_utils.int64_feature([1]),
                'label': tfrecord_utils.int64_feature([label]),
                'image_raw': tfrecord_utils.bytes_feature([image_raw])}))
            writer.write(example.SerializeToString())

    try:
        writer.close()
    except NameError:
        pass
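
# A minimal reader sketch (an assumption, not part of the original source) for
# the image Examples written above under TF 1.x; the feature keys match the
# schema used throughout this file, and uint8 is assumed for the raw pixels.
def parse_image_example(serialized_example):
    features = tf.parse_single_example(
        serialized_example,
        features={
            'height': tf.FixedLenFeature([], tf.int64),
            'width': tf.FixedLenFeature([], tf.int64),
            'depth': tf.FixedLenFeature([], tf.int64),
            'label': tf.FixedLenFeature([], tf.int64),
            'image_raw': tf.FixedLenFeature([], tf.string),
        })
    image = tf.decode_raw(features['image_raw'], tf.uint8)
    return image, features['label']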
def write_to_tfrecords(filename, destination_dir, responses, prompts, q_ids, grades, speakers, targets, predictions,
                       debug=False):
    # Check that all the input lists are of equal lengths
    assert len({len(responses), len(prompts), len(q_ids), len(grades), len(speakers), len(targets), len(predictions)}) == 1

    # Create the training TF Record file
    print('Writing: ', filename)

    writer = tf.python_io.TFRecordWriter(os.path.join(destination_dir, filename))
    for idx, (response, prompt, q_id, grd, spkr, tgt, example_pred) in enumerate(zip(responses, prompts, q_ids, grades, speakers, targets, predictions)):
        example = tf.train.SequenceExample(
            context=tf.train.Features(feature={
                'targets': tfrecord_utils.float_feature([tgt]),
                'grade': tfrecord_utils.float_feature([float(grd)]),
                'teacher_pred': tfrecord_utils.float_feature(list(example_pred)),
                'spkr': tfrecord_utils.bytes_feature([spkr]),
                'q_id': tfrecord_utils.int64_feature([q_id]),
                'example_idx': tfrecord_utils.int64_feature([idx])  # Stores the example number for easy back-reference to txt files even when examples get shuffled (0 indexed)
            }),
            feature_lists=tf.train.FeatureLists(feature_list={
                'response': tfrecord_utils.int64_feature_list(response),
                'prompt': tfrecord_utils.int64_feature_list(prompt)}))
        if debug:
            # Print out the data that is going to be saved:
            print("-----------------\n", "EXAMPLE: \n", "Response: {}\nPrompt: {}\nQ_id: {}\n\ntarget: {}\ngrade: {}\n,teacher_pred: {}\nexample_num: {}\n\n".format(response, prompt, q_id, tgt, grd, example_pred, idx))
        writer.write(example.SerializeToString())
    writer.close()
    return
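
# A minimal reader sketch (an assumption, not part of the original source) for
# the SequenceExamples written by write_to_tfrecords; only the context keys
# shared by all of the sequence writers in this file are parsed here.
def parse_sequence_example(serialized_example):
    context, sequence = tf.parse_single_sequence_example(
        serialized_example,
        context_features={
            'targets': tf.FixedLenFeature([], tf.float32),
            'grade': tf.FixedLenFeature([], tf.float32),
            'spkr': tf.FixedLenFeature([], tf.string),
            'q_id': tf.FixedLenFeature([], tf.int64),
        },
        sequence_features={
            'response': tf.FixedLenSequenceFeature([], tf.int64),
            'prompt': tf.FixedLenSequenceFeature([], tf.int64),
        })
    return context, sequence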
def process_data(args):
    if args.cifar100:
        files = ['test', 'train']
    else:
        files = ['data_batch_1', 'data_batch_2', 'data_batch_3', 'data_batch_4', 'data_batch_5', 'test_batch']

    for file in files:
        labels, images = unpickle(os.path.join(args.data_path, file))
        num_examples = images.shape[0]
        print(num_examples)

        for index in range(num_examples):
            label = int(labels[index])
            if index % 10000 == 0:
                print(index)
                # Start a new shard every 10000 examples, closing any open writer
                try:
                    writer.close()
                except NameError:
                    pass
                writer = tf.python_io.TFRecordWriter(os.path.join(args.target_path, file + '_' + str(index // 10000) + '.tfrecord'))
            image_raw = images[index].tostring()
            example = tf.train.Example(features=tf.train.Features(feature={
                'height': tfrecord_utils.int64_feature([32]),
                'width': tfrecord_utils.int64_feature([32]),
                'depth': tfrecord_utils.int64_feature([3]),
                'label': tfrecord_utils.int64_feature([label]),
                'image_raw': tfrecord_utils.bytes_feature([image_raw])}))
            writer.write(example.SerializeToString())

    try:
        writer.close()
    except NameError:
        pass
def process_data(args):
    files = os.listdir(os.path.join(args.data_path, 'images'))
    num_examples = len(files)
    # Make sure there is a good global shuffle
    random.shuffle(files)
    for i, file in enumerate(files):
        file_path = os.path.join(args.data_path, 'images/' + file)
        if i % 5000 == 0:
            print(i)
            # Start a new shard every 5000 images, closing any open writer
            try:
                writer.close()
            except NameError:
                pass
            writer = tf.python_io.TFRecordWriter(
                os.path.join(args.target_path,
                             'LSUN' + '_' + str(i // 5000) + '.tfrecord'))
        if os.path.isfile(file_path) and os.stat(file_path).st_size != 0:
            im = Image.open(file_path)
            im = im.resize((args.size, args.size), resample=Image.BICUBIC)
            image_raw = im.convert("RGB").tobytes("raw", "RGB")

            example = tf.train.Example(features=tf.train.Features(
                feature={
                    'height': tfrecord_utils.int64_feature([args.size]),
                    'width': tfrecord_utils.int64_feature([args.size]),
                    'depth': tfrecord_utils.int64_feature([3]),
                    'label': tfrecord_utils.int64_feature([-1]),
                    'image_raw': tfrecord_utils.bytes_feature([image_raw])
                }))
            writer.write(example.SerializeToString())

    try:
        writer.close()
    except NameError:
        pass
def process_data(args):
    data = np.loadtxt(args.data_path)
    data_X = data[:, :256]
    data_y = np.argmax(np.asarray(data[:, 256:], dtype=np.int32), axis=1)
    data_X = np.reshape(data_X, [-1, 16, 16])
    num_examples = data.shape[0]
    print(num_examples)
    for index in range(num_examples):
        if index % 10000 == 0:
            # Start a new shard every 10000 examples, closing any open writer
            try:
                writer.close()
            except NameError:
                pass
            writer = tf.python_io.TFRecordWriter(
                os.path.join(args.target_path,
                             'semeion_' + str(index // 10000) + '.tfrecord'))

        image_raw = imresize(data_X[index], size=[28, 28]).tostring()
        example = tf.train.Example(features=tf.train.Features(
            feature={
                'height': tfrecord_utils.int64_feature([28]),
                'width': tfrecord_utils.int64_feature([28]),
                'depth': tfrecord_utils.int64_feature([1]),
                'label': tfrecord_utils.int64_feature([int(data_y[index])]),
                'image_raw': tfrecord_utils.bytes_feature([image_raw])
            }))
        writer.write(example.SerializeToString())

    try:
        writer.close()
    except NameError:
        pass
def process_data(data, args):
    sets = ['train', 'valid', 'test']
    for set in sets:
        images = data[set].images
        labels = data[set].labels
        num_examples = data[set].num_examples

        for index in range(num_examples):
            label = int(labels[index])
            if index % 10000 == 0:
                # Start a new shard every 10000 examples, closing any open writer
                try:
                    writer.close()
                except NameError:
                    pass
                writer = tf.python_io.TFRecordWriter(
                    os.path.join(args.target_path,
                                 set + '_' + str(index // 10000) + '.tfrecord'))

            image_raw = images[index].tostring()
            example = tf.train.Example(features=tf.train.Features(
                feature={
                    'height': tfrecord_utils.int64_feature([28]),
                    'width': tfrecord_utils.int64_feature([28]),
                    'depth': tfrecord_utils.int64_feature([1]),
                    'label': tfrecord_utils.int64_feature([label]),
                    'image_raw': tfrecord_utils.bytes_feature([image_raw])
                }))
            writer.write(example.SerializeToString())
    try:
        writer.close()
    except NameError:
        pass
def process_data_XVAL(args):

    files = [
        'test_32x32.mat',
        'train_32x32.mat',
        'extra_32x32.mat',
    ]

    for file in files:
        data_y, data_X = load_svhn_mat(os.path.join(args.data_path, file))
        set = file.split('.')[0]
        max_digits = np.max(data_y)

        num_examples = data_X.shape[0]
        for fold in range(max_digits + 1):
            print('Fold', fold)
            fname = 'fold_' + str(fold)
            path = os.path.join(args.target_path, fname)
            if not os.path.isdir(path):
                os.makedirs(path)
            for index in range(num_examples):
                label = int(data_y[index])
                # Remap labels so the held-out digit becomes the last class
                if label > fold:
                    label -= 1
                elif label == fold:
                    label = max_digits
                if index % 42660 == 0:
                    # Start new shards, closing any writers from the previous shard
                    try:
                        writer_seen.close()
                        writer_unseen.close()
                    except NameError:
                        pass
                    writer_seen = tf.python_io.TFRecordWriter(
                        os.path.join(
                            path, set + '_' + fname + '_' +
                            str(index // 42660) + '.tfrecord'))
                    writer_unseen = tf.python_io.TFRecordWriter(
                        os.path.join(
                            path, set + '_heldout_' + fname + '_' +
                            str(index // 42660) + '.tfrecord'))

                image_raw = data_X[index].tostring()
                example = tf.train.Example(features=tf.train.Features(
                    feature={
                        'height': tfrecord_utils.int64_feature([32]),
                        'width': tfrecord_utils.int64_feature([32]),
                        'depth': tfrecord_utils.int64_feature([3]),
                        'label': tfrecord_utils.int64_feature([label]),
                        'image_raw': tfrecord_utils.bytes_feature([image_raw])
                    }))
                if int(data_y[index]) == fold:
                    writer_unseen.write(example.SerializeToString())
                else:
                    writer_seen.write(example.SerializeToString())

    try:
        writer_seen.close()
        writer_unseen.close()
    except NameError:
        pass
def process_data(args):
    sets = ['train', 'val', 'test']
    index = {}
    with open(os.path.join(args.data_path, 'index.txt'), 'r') as f:
        ind_list = [line[:-1].split() for line in f.readlines()]
    for i in range(len(ind_list)):
        index[ind_list[i][0]] = {'class': i, 'name': ind_list[i][1]}

    for set in sets:
        print(set)
        files = os.listdir(os.path.join(args.data_path, set + '/images/'))
        num_examples = len(files)
        # Make sure there is a good global shuffle
        random.shuffle(files)
        if set == 'val':
            with open(os.path.join(args.data_path, 'val/val_index.txt'), 'r') as f:
                val_list = [line[:-1].split() for line in f.readlines()]
            val_index = {}
            for item in val_list:
                val_index[item[0]] = item[1]
        for i, file in enumerate(files):
            file_path = os.path.join(args.data_path, set+'/images/'+file)
            if set == 'train':
                label = index[file.split('_')[0]]['class']
                name = index[file.split('_')[0]]['name']
            elif set == 'val':
                code = val_index[file]
                label = index[code]['class']
                name = index[code]['name']
            else:
                label = -1
                name = 'NA'

            if i % 10000 == 0:
                print(i)
                # Start a new shard every 10000 images, closing any open writer
                try:
                    writer.close()
                except NameError:
                    pass
                writer = tf.python_io.TFRecordWriter(os.path.join(args.target_path, set + '_' + str(i // 10000) + '.tfrecord'))
            if os.path.isfile(file_path) and os.stat(file_path).st_size != 0:
                im = Image.open(file_path)
                im = im.resize((args.size, args.size), resample=Image.BICUBIC)
                image_raw = im.convert("RGB").tobytes("raw", "RGB")

                example = tf.train.Example(features=tf.train.Features(feature={
                    'height': tfrecord_utils.int64_feature([args.size]),
                    'width': tfrecord_utils.int64_feature([args.size]),
                    'depth': tfrecord_utils.int64_feature([3]),
                    'label': tfrecord_utils.int64_feature([label]),
                    'image_raw': tfrecord_utils.bytes_feature([image_raw])}))
                writer.write(example.SerializeToString())

    try:
        writer.close()
    except NameError:
        pass
def process_data_XVAL(data, args):
    sets = ['train', 'valid', 'test']
    for set in sets:
        images = data[set].images
        labels = data[set].labels
        num_examples = data[set].num_examples

        for fold in range(10):
            print('Fold', fold)
            name = 'fold_' + str(fold)
            path = os.path.join(args.target_path, name)
            if not os.path.isdir(path):
                os.makedirs(path)
            for index in range(num_examples):
                label = int(labels[index])
                if index % 10000 == 0:
                    # Start new shards, closing any writers from the previous shard
                    try:
                        writer_seen.close()
                        writer_unseen.close()
                    except NameError:
                        pass
                    writer_seen = tf.python_io.TFRecordWriter(
                        os.path.join(
                            path, set + '_' + name + '_' + str(index // 10000) +
                            '.tfrecord'))
                    writer_unseen = tf.python_io.TFRecordWriter(
                        os.path.join(
                            path, set + '_heldout_' + name + '_' +
                            str(index // 10000) + '.tfrecord'))

                # Remap labels so the held-out digit becomes class 9
                if label > fold:
                    label -= 1
                elif label == fold:
                    label = 9
                image_raw = images[index].tostring()
                example = tf.train.Example(features=tf.train.Features(
                    feature={
                        'height': tfrecord_utils.int64_feature([28]),
                        'width': tfrecord_utils.int64_feature([28]),
                        'depth': tfrecord_utils.int64_feature([1]),
                        'label': tfrecord_utils.int64_feature([label]),
                        'image_raw': tfrecord_utils.bytes_feature([image_raw])
                    }))

                if int(labels[index]) == fold:
                    writer_unseen.write(example.SerializeToString())
                else:
                    writer_seen.write(example.SerializeToString())
    try:
        writer_seen.close()
        writer_unseen.close()
    except NameError:
        pass
def process_data_greyscale(args):
    files = [
        'test_32x32.mat',
        'train_32x32.mat',
        'extra_32x32.mat',
    ]

    for file in files:
        data_y, data_X = load_svhn_mat(os.path.join(args.data_path, file))
        # Luma-style greyscale conversion from the RGB channels
        data_X = 0.21 * data_X[:, :, :, 0] + 0.72 * data_X[:, :, :, 1] + 0.07 * data_X[:, :, :, 2]
        data_X = np.asarray(data_X, dtype=np.int32)
        print(data_X.shape)
        fname = file.split('.')[0]

        num_examples = data_X.shape[0]
        print(num_examples)
        for index in range(num_examples):
            label = int(data_y[index])
            if index % 10000 == 0:
                # Start a new shard every 10000 examples, closing any open writer
                try:
                    writer.close()
                except NameError:
                    pass
                writer = tf.python_io.TFRecordWriter(
                    os.path.join(
                        args.target_path,
                        fname + '_gs_' + str(index // 10000) + '.tfrecord'))

            image_raw = imresize(data_X[index],
                                 [args.size, args.size]).tostring()
            example = tf.train.Example(features=tf.train.Features(
                feature={
                    'height': tfrecord_utils.int64_feature([args.size]),
                    'width': tfrecord_utils.int64_feature([args.size]),
                    'depth': tfrecord_utils.int64_feature([1]),
                    'label': tfrecord_utils.int64_feature([label]),
                    'image_raw': tfrecord_utils.bytes_feature([image_raw])
                }))
            writer.write(example.SerializeToString())

    try:
        writer.close()
    except NameError:
        pass
def process_data_XVAL(args):
    files = ['data_batch_1', 'data_batch_2', 'data_batch_3', 'data_batch_4', 'data_batch_5', 'test_batch']

    for file in files:
        labels, images = unpickle(os.path.join(args.data_path, file))
        max_digits = np.max(labels)

        num_examples = images.shape[0]
        for fold in range(max_digits+1):
            print('Fold', fold)
            fname = 'fold_' + str(fold)
            path = os.path.join(args.target_path, fname)
            if not os.path.isdir(path):
                os.makedirs(path)
            # Open one pair of writers per batch file and fold, before
            # iterating over the examples, so the files are not truncated
            # on every iteration
            try:
                writer_seen.close()
                writer_unseen.close()
            except NameError:
                pass
            writer_seen = tf.python_io.TFRecordWriter(os.path.join(path, file + '_' + fname + '.tfrecord'))
            writer_unseen = tf.python_io.TFRecordWriter(os.path.join(path, file + '_heldout_' + fname + '.tfrecord'))
            for index in range(num_examples):
                label = int(labels[index])
                # Remap labels so the held-out class becomes the last class
                if label > fold:
                    label -= 1
                elif label == fold:
                    label = max_digits

                image_raw = images[index].tostring()
                example = tf.train.Example(features=tf.train.Features(feature={
                    'height': tfrecord_utils.int64_feature([32]),
                    'width': tfrecord_utils.int64_feature([32]),
                    'depth': tfrecord_utils.int64_feature([3]),
                    'label': tfrecord_utils.int64_feature([label]),
                    'image_raw': tfrecord_utils.bytes_feature([image_raw])}))
                if int(labels[index]) == fold:
                    writer_unseen.write(example.SerializeToString())
                else:
                    writer_seen.write(example.SerializeToString())

    try:
        writer_seen.close()
        writer_unseen.close()
    except NameError:
        pass
def process_data(args):
    files = [
        'test_32x32.mat',
        'train_32x32.mat',
        'extra_32x32.mat',
    ]

    for file in files:
        data_y, data_X = load_svhn_mat(os.path.join(args.data_path, file))
        fname = file.split('.')[0]

        num_examples = data_X.shape[0]
        print(num_examples)
        for index in range(num_examples):
            label = int(data_y[index])
            if index % 10000 == 0:
                # Start a new shard every 10000 examples, closing any open writer
                try:
                    writer.close()
                except NameError:
                    pass
                writer = tf.python_io.TFRecordWriter(
                    os.path.join(
                        args.target_path,
                        fname + '_' + str(index // 10000) + '.tfrecord'))

            image_raw = data_X[index].tostring()
            example = tf.train.Example(features=tf.train.Features(
                feature={
                    'height': tfrecord_utils.int64_feature([32]),
                    'width': tfrecord_utils.int64_feature([32]),
                    'depth': tfrecord_utils.int64_feature([3]),  # raw SVHN images are 3-channel RGB
                    'label': tfrecord_utils.int64_feature([label]),
                    'image_raw': tfrecord_utils.bytes_feature([image_raw])
                }))
            writer.write(example.SerializeToString())

    try:
        writer.close()
    except NameError:
        pass
def main(argv=None):
    """Converts a dataset to tfrecords."""
    args = commandLineParser.parse_args()

    if os.path.isdir(args.destination_dir):
        print('destination directory exists. Exiting...')
        return
    else:
        os.makedirs(args.destination_dir)

    if not os.path.isdir('CMDs'):
        os.makedirs('CMDs')

    with open('CMDs/step_preprocess_data.cmd', 'a') as f:
        f.write(' '.join(sys.argv) + '\n')
        f.write('--------------------------------\n')

    # Load responses and prompts as sequences of word ids
    responses, _ = load_text(args.input_data_path, args.input_wlist_path)
    prompts, _ = load_text(args.input_prompt_path, args.input_wlist_path)

    # Load up the prompts as sequences of words
    with open(args.input_prompt_path, 'r') as file:
        topics = [line.replace('\n', '') for line in file.readlines()]

    # Get unique set of topics and topic counts (and sort them)
    unique_topics, topic_counts = np.unique(topics, return_counts=True)
    topics = unique_topics[np.flip(np.argsort(topic_counts), 0)]
    topic_counts = np.flip(np.sort(topic_counts), 0)

    # Create dictionary for topics mapping sentence to topic id
    # Also create file of sorted topics and unigrams file
    # Unigram file later used for training
    topic_dict = {}
    with open(os.path.join(args.destination_dir, 'unigrams.txt'),
              'w') as ufile:
        with open(os.path.join(args.destination_dir, 'sorted_topics.txt'),
                  'w') as tfile:
            for i, (topic, count) in enumerate(zip(topics, topic_counts)):
                topic_dict[topic] = i
                ufile.write(str(i) + ',' + str(int(count)) + '\n')
                tfile.write(topic + '\n')

    # Load up the grades and speakers
    grades = np.loadtxt(args.input_grade_path)
    with open(args.input_spkr_path, 'r') as file:
        speakers = np.asarray(
            [line.replace('\n', '') for line in file.readlines()])

    # Create a list of topic IDs for every response
    with open(args.input_prompt_path, 'r') as file:
        q_ids = np.asarray(
            [topic_dict[line.replace('\n', '')] for line in file.readlines()])

    ### Split data into train and validation  data sets
    n = len(responses)
    train_size = int(n * (1.0 - args.valid_fraction))
    valid_size = n - train_size

    print('Total dataset size', n, 'Train dataset size', train_size, 'Valid dataset size', valid_size)

    np.random.seed(1000)

    permutation = np.random.choice(np.arange(n), n, replace=False)
    index_train = permutation[:train_size]
    index_valid = permutation[train_size:]

    trn_responses = responses[index_train]
    trn_prompts = prompts[index_train]
    trn_q_ids = q_ids[index_train]
    trn_speakers = speakers[index_train]
    trn_grades = grades[index_train]

    valid_responses = responses[index_valid]
    valid_prompts = prompts[index_valid]
    valid_q_ids = q_ids[index_valid]
    valid_speakers = speakers[index_valid]
    valid_grades = grades[index_valid]

    # Create the training TF Record file
    filename = 'relevance.train.tfrecords'
    print('Writing', filename)
    writer = tf.python_io.TFRecordWriter(
        os.path.join(args.destination_dir, filename))
    for response, prompt, q_id, grd, spkr in zip(trn_responses, trn_prompts,
                                                 trn_q_ids, trn_grades,
                                                 trn_speakers):
        example = tf.train.SequenceExample(
            context=tf.train.Features(
                feature={
                    'targets': tfrecord_utils.float_feature([1.0]),
                    'grade': tfrecord_utils.float_feature([grd]),
                    'spkr': tfrecord_utils.bytes_feature([spkr]),
                    'q_id': tfrecord_utils.int64_feature([q_id])
                }),
            feature_lists=tf.train.FeatureLists(
                feature_list={
                    'response': tfrecord_utils.int64_feature_list(response),
                    'prompt': tfrecord_utils.int64_feature_list(prompt)
                }))
        writer.write(example.SerializeToString())
    writer.close()

    # Create the validation TF Record file
    filename = 'relevance.valid.tfrecords'
    print('Writing', filename)
    writer = tf.python_io.TFRecordWriter(
        os.path.join(args.destination_dir, filename))
    for response, prompt, q_id, grd, spkr in zip(valid_responses,
                                                 valid_prompts, valid_q_ids,
                                                 valid_grades, valid_speakers):
        example = tf.train.SequenceExample(
            context=tf.train.Features(
                feature={
                    'targets': tfrecord_utils.float_feature([1.0]),
                    'grade': tfrecord_utils.float_feature([grd]),
                    'spkr': tfrecord_utils.bytes_feature([spkr]),
                    'q_id': tfrecord_utils.int64_feature([q_id])
                }),
            feature_lists=tf.train.FeatureLists(
                feature_list={
                    'response': tfrecord_utils.int64_feature_list(response),
                    'prompt': tfrecord_utils.int64_feature_list(prompt)
                }))
        writer.write(example.SerializeToString())
    writer.close()
def main(argv=None):
    """Converts a dataset to tfrecords."""
    args = commandLineParser.parse_args()

    if not os.path.isdir(args.destination_dir):
        os.makedirs(args.destination_dir)

    if not os.path.isdir('CMDs'):
        os.makedirs('CMDs')

    with open('CMDs/step_preprocess_test_data.cmd', 'a') as f:
        f.write(' '.join(sys.argv) + '\n')
        f.write('--------------------------------\n')

    # Load responses and prompts as sequences of word ids
    responses, _ = load_text(args.input_data_path, args.input_wlist_path)
    prompts, _ = load_text(args.input_prompt_path, args.input_wlist_path)

    # Load up the grades, targets and speakers
    grades = np.loadtxt(args.input_grade_path)
    targets = np.loadtxt(args.input_tgt_path, dtype=np.float32)
    with open(args.input_spkr_path, 'r') as file:
        speakers = np.asarray(
            [line.replace('\n', '') for line in file.readlines()])

    # Load up sorted topics and (re)construct the topic dict so that I map each prompt word sequence to its q_id
    topic_dict = {}
    with open(args.sorted_topics_path, 'r') as tfile:
        for i, topic in enumerate(tfile.readlines()):
            topic_dict[topic.replace('\n', '')] = i

    # Load up the prompts as sequences of words and convert to q_id;
    # fall back to -1 if a prompt is missing from the topic dictionary
    try:
        with open(args.input_prompt_path, 'r') as file:
            q_ids = np.asarray([
                topic_dict[line.replace('\n', '')]
                for line in file.readlines()
            ])
    except KeyError:
        with open(args.input_prompt_path, 'r') as file:
            q_ids = np.asarray([-1 for line in file.readlines()])

    # Create the TF Record file
    filename = args.name + '.tfrecords'
    print('Writing', filename)

    writer = tf.python_io.TFRecordWriter(
        os.path.join(args.destination_dir, filename))
    for response, prompt, q_id, grd, spkr, tgt in zip(responses, prompts,
                                                      q_ids, grades, speakers,
                                                      targets):
        example = tf.train.SequenceExample(
            context=tf.train.Features(
                feature={
                    'targets': tfrecord_utils.float_feature([tgt]),
                    'grade': tfrecord_utils.float_feature([grd]),
                    'spkr': tfrecord_utils.bytes_feature([spkr]),
                    'q_id': tfrecord_utils.int64_feature([q_id])
                }),
            feature_lists=tf.train.FeatureLists(
                feature_list={
                    'response': tfrecord_utils.int64_feature_list(response),
                    'prompt': tfrecord_utils.int64_feature_list(prompt)
                }))
        writer.write(example.SerializeToString())
    writer.close()
def write_to_tfrecords(filename,
                       destination_dir,
                       responses,
                       prompts,
                       q_ids,
                       grades,
                       speakers,
                       targets=1.0,
                       debug=False):
    # Check that all the input lists are of equal lengths

    # TEMP
    print(len(responses))
    print(len(prompts))
    print(len(q_ids))
    print(len(grades))
    print(len(speakers))

    assert len(
        {len(responses),
         len(prompts),
         len(q_ids),
         len(grades),
         len(speakers)}) == 1

    if isinstance(targets, (float, int)):
        # If targets is a scalar, use this value for every example
        targets = [float(targets)] * len(responses)
    else:
        assert isinstance(targets, list)
        assert len(targets) == len(responses)

    # Create the training TF Record file
    print('Writing: ', filename)

    writer = tf.python_io.TFRecordWriter(
        os.path.join(destination_dir, filename))
    for response, prompt, q_id, grd, spkr, tgt in zip(responses, prompts,
                                                      q_ids, grades, speakers,
                                                      targets):
        if debug:
            # Print out the data that is going to be saved:
            print(
                "-----------------\n", "EXAMPLE: \n",
                "Response: {}\nPrompt: {}\nQ_id: {}\n\ntarget: {}\ngrade: {}\n\n"
                .format(response, prompt, q_id, tgt, grd))
        example = tf.train.SequenceExample(
            context=tf.train.Features(
                feature={
                    'targets': tfrecord_utils.float_feature([tgt]),
                    'grade': tfrecord_utils.float_feature([float(grd)]),
                    'spkr': tfrecord_utils.bytes_feature([spkr]),
                    'q_id': tfrecord_utils.int64_feature([q_id])
                }),
            feature_lists=tf.train.FeatureLists(
                feature_list={
                    'response': tfrecord_utils.int64_feature_list(response),
                    'prompt': tfrecord_utils.int64_feature_list(prompt)
                }))
        writer.write(example.SerializeToString())
    writer.close()
    return
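
# A hypothetical usage sketch for the write_to_tfrecords variant above; the
# toy word-id sequences, grades and speaker ids are made up for illustration.
#
# write_to_tfrecords('toy.tfrecords', '/tmp',
#                    responses=[[1, 2, 3], [4, 5]],
#                    prompts=[[7, 8], [9]],
#                    q_ids=[0, 1],
#                    grades=[3.5, 4.0],
#                    speakers=[b'spkr-A', b'spkr-B'],
#                    targets=1.0)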
def main(argv=None):
    args = commandLineParser.parse_args()
    if not os.path.isdir('CMDs'):
        os.mkdir('CMDs')
    with open('CMDs/step_process_omniglot_data.txt', 'a') as f:
        f.write(' '.join(sys.argv) + '\n')
        f.write('--------------------------------\n')

    if not os.path.isdir(args.target_path):
        os.makedirs(args.target_path)

    collage = np.zeros(shape=[4 * args.size, 8 * args.size])
    dirs = os.listdir(args.data_path)
    for j, item in enumerate(dirs):
        if j % 15000 == 0:
            # Start a new shard every 15000 images, closing any open writer
            try:
                writer.close()
            except NameError:
                pass
            writer = tf.python_io.TFRecordWriter(
                os.path.join(args.target_path,
                             'omniglot_' + str(j // 15000) + '.tfrecord'))
            print(j)
        img_file = os.path.join(args.data_path, item)
        if os.path.isfile(img_file) and os.stat(img_file).st_size != 0:
            try:
                im = Image.open(img_file)
                width, height = im.size
                size = np.min([width, height])
                if size < args.size:
                    continue
                imResize = im.resize((args.size, args.size),
                                     resample=Image.NEAREST)
                imResize = np.array(imResize.getdata(),
                                    dtype=np.uint8).reshape(
                                        args.size, args.size)
                if j < 32:
                    # Paste the first 32 images into a 4x8 preview collage
                    i = j % 8
                    k = j // 8
                    collage[k * args.size:(k + 1) * args.size,
                            i * args.size:(i + 1) * args.size] = imResize
                elif j == 32:
                    fig = plt.imshow(np.asarray(collage, dtype=np.uint8),
                                     cmap='gray')
                    fig.axes.get_xaxis().set_visible(False)
                    fig.axes.get_yaxis().set_visible(False)
                    path = os.path.join(args.target_path, 'omniglot.png')
                    plt.savefig(path, bbox_inches='tight')
                    plt.close()

                imResize = np.reshape(imResize, (args.size * args.size))
                imResize_raw = imResize.tostring()
                example = tf.train.Example(features=tf.train.Features(
                    feature={
                        'height': tfrecord_utils.int64_feature([args.size]),
                        'width': tfrecord_utils.int64_feature([args.size]),
                        'depth': tfrecord_utils.int64_feature([1]),
                        'label': tfrecord_utils.int64_feature([-1]),
                        'image_raw': tfrecord_utils.bytes_feature(
                            [imResize_raw])
                    }))
                writer.write(example.SerializeToString())
            except Exception:
                # Log unreadable or corrupt image files and carry on
                with open('errors', 'a') as handle:
                    handle.write(item + '\n')
                    print('Error processing', item)
    writer.close()
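
# A minimal sanity-check sketch (an assumption, not part of the original
# source): count the serialized records in a shard produced by any of the
# functions above, using the TF 1.x record iterator.
def count_records(tfrecord_path):
    return sum(1 for _ in tf.python_io.tf_record_iterator(tfrecord_path))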