Example #1
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from sklearn.manifold import TSNE

from utils import yaml_utils


def visualization_result(final_embeddings_path, reverse_dictionary_path,
                         output_image, num_plot):
    def plot_with_labels(low_dim_embs, labels, filename, fonts):
        plt.figure(figsize=(18, 18))  # in inches
        for i, label in enumerate(labels):
            x, y = low_dim_embs[i, :]
            plt.scatter(x, y, s=18)
            plt.annotate(label,
                         fontproperties=fonts,
                         xy=(x, y),
                         xytext=(5, 2),
                         textcoords='offset points',
                         ha='right',
                         va='bottom')
        plt.savefig(filename)

    final_embeddings = yaml_utils.read(final_embeddings_path)
    reverse_dictionary = yaml_utils.read(reverse_dictionary_path)
    # Use a Chinese font so that Chinese labels render correctly in the figure
    font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=18)
    tsne = TSNE(perplexity=30,
                n_components=2,
                init='pca',
                n_iter=5000,
                method='exact')
    low_dim_embs = tsne.fit_transform(final_embeddings[:num_plot, :])
    labels = [reverse_dictionary[i] for i in range(num_plot)]
    plot_with_labels(low_dim_embs, labels, output_image, fonts=font)
Example #2
    def __init__(self, dataset_name, batch_size, valid_size, valid_window,
                 name, checkpoint_dir, embedding_size, num_sampled, num_epoch,
                 sess, tag, **args):
        self.dataset_name = dataset_name
        self.batch_size = batch_size
        self.dataset_info = yaml_utils.read(args['dataset']['path'])
        self.vocabulary_size = self.dataset_info['vocabulary_size']
        self.reverse_dictionary = yaml_utils.read(
            self.dataset_info['reverse_dictionary'])
        self.train_data_generator = generate_batch(self.dataset_info['data'],
                                                   **args['dataset'])

        # We pick a random validation set to sample nearest neighbors. Here we limit the
        # validation samples to the words that have a low numeric ID, which by
        # construction are also the most frequent.
        self.valid_size = valid_size
        self.valid_examples = np.array(
            random.sample(range(valid_window), valid_size))

        self.name = name
        self.checkpoint_dir = Path(
            checkpoint_dir) / self.dataset_name / self.name / tag
        self.embedding_size = embedding_size
        self.num_sampled = num_sampled

        self.num_epoch = num_epoch
        self.valid_freq = 10000
        self.sess = sess
        self.tag = tag
        self.kwargs = args
        self.build_network()
        self.saver = tf.train.Saver()
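
Example #2's constructor ends with a call to `self.build_network()`, which this page does not show. Purely for context, here is a minimal sketch of the standard TF1 skip-gram graph (embedding lookup, NCE loss, L2-normalized embeddings) that such a method typically builds; the placeholder names, learning rate, and optimizer below are assumptions, not the repository's code.

    def build_network(self):
        # Hypothetical sketch, not the repository's implementation.
        self.train_inputs = tf.placeholder(tf.int32, shape=[self.batch_size])
        self.train_labels = tf.placeholder(tf.int32, shape=[self.batch_size, 1])
        embeddings = tf.Variable(
            tf.random_uniform([self.vocabulary_size, self.embedding_size],
                              -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, self.train_inputs)
        nce_weights = tf.Variable(
            tf.truncated_normal([self.vocabulary_size, self.embedding_size],
                                stddev=1.0 / self.embedding_size ** 0.5))
        nce_biases = tf.Variable(tf.zeros([self.vocabulary_size]))
        # Noise-contrastive estimation loss over num_sampled negative samples.
        self.loss = tf.reduce_mean(
            tf.nn.nce_loss(weights=nce_weights,
                           biases=nce_biases,
                           labels=self.train_labels,
                           inputs=embed,
                           num_sampled=self.num_sampled,
                           num_classes=self.vocabulary_size))
        self.optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(self.loss)
        # L2-normalized embeddings, queried for nearest neighbours in Example #3.
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
        self.normalized_embeddings = embeddings / norm

Example #3 then feeds such trained, normalized embeddings back in through a plain placeholder to look up the most similar words.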
Example #3
def most_similar(final_embeddings_path, reverse_dictionary_path, words):
    reverse_dictionary = yaml_utils.read(reverse_dictionary_path)
    final_embeddings = yaml_utils.read(final_embeddings_path)
    tf.set_random_seed(19)
    tf_config = tf.ConfigProto(allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        valid_dataset = tf.placeholder(tf.int32, shape=[5])
        normalized_embeddings = tf.placeholder(tf.float32, shape=[5000, 128])
        valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
                                                  valid_dataset)
        similarity = tf.matmul(valid_embeddings,
                               normalized_embeddings,
                               transpose_b=True)
        init_op = tf.global_variables_initializer()
        sess.run(init_op)
        sim = sess.run(similarity,
                       feed_dict={
                           valid_dataset: words,
                           normalized_embeddings: final_embeddings
                       })
        for i in range(len(words)):
            valid_word = reverse_dictionary[words[i]]
            top_k = 5
            nearest = (-sim[i, :]).argsort()[1:top_k + 1]
            log_str = 'Nearest to ' + valid_word + ': '
            for k in range(top_k):
                close_word = reverse_dictionary[nearest[k]]
                log_str += str(k + 1) + ':' + close_word + ' '
            print(log_str)
Example #4
def data_loader():
    train_list = yaml_utils.read(str(output_path / 't22seg_train.yaml'))
    train_generator = data_generator(train_list, batch_size=6)

    test_list = yaml_utils.read(str(output_path / 't22seg_test.yaml'))
    test_generator = data_generator(test_list, batch_size=12)
    return train_generator, len(train_list), test_generator, len(test_list)
Example #5
    def __init__(self, sess, option):
        option = Option(option)
        self.sess = sess
        self.options = option
        self.print_freq = option.print_freq
        self.save_freq = option.save_freq

        self.batch_size = option.batch_size
        self.image_size = option.dataset.image_size
        self.in_channels = option.model.in_channels
        self.out_channels = option.model.out_channels
        self.L1_lambda = option.model.l1_lambda
        self.is_training = option.phase == 'train'
        self.train_dir = yaml_utils.read(option.dataset.train_path)
        self.test_dir = yaml_utils.read(option.dataset.test_path)

        self.generator = unet
        self.discriminator = patch_gan
        self.criterionGAN = lsgan_loss
        net_options = {
            'batch_size': self.batch_size,
            'image_size': self.image_size,
            'out_channels': self.out_channels,
            'G_channels': option.model.generator.channels,
            'D_channels': option.model.discriminator.channels,
            'is_training': self.is_training
        }
        self._build_model(net_options)
        self.saver = tf.train.Saver()
Example #6
def evaluate_model(args):
    tf.set_random_seed(19)
    tf_config = tf.ConfigProto(allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        print('Loading data')
        dataset_info = yaml_utils.read(args['dataset']['path'])
        dictionary = yaml_utils.read(dataset_info['dictionary_path'])
        print('Dictionary loaded')
        reverse_dictionary = yaml_utils.read(
            dataset_info['reverse_dictionary_path'])
        print('Reverse dictionary loaded')
        if 'embedding_path' in dataset_info.keys():
            embedding = np.array(yaml_utils.read(
                dataset_info['embedding_path']),
                                 dtype=np.float32)
        else:
            embedding = None
        print('Word embeddings loaded')
        test_dataset = yaml_utils.read(dataset_info['eval_path'])
        print('Test data loaded')
        print('Loading complete')
        data_loader = get_data_loader_by_name(
            args['dataset']['data_generator'])
        eval_data_generator = data_loader(
            dictionary,
            False,
            test_dataset,
            batch_size=args['batch_size'],
            seq_length=args['dataset']['seq_length'],
            reverse_dictionary=reverse_dictionary)
        eval_data_generator.get_reverse_dictionary()
        model_class = get_model_class_by_name(args['model']['name'])
        model = model_class(sess=sess,
                            train_generator=None,
                            eval_generator=eval_data_generator,
                            embedding=embedding,
                            **dataset_info,
                            **args['dataset'],
                            **args['model'],
                            **args)
        result, labels = model.test()
        # yaml_utils.write(args['model']['checkpoint_dir'] + '/' + args['dataset']['dataset_name'] + '/' +
        #                  args['model']['name'] + '/' + args['tag'] + '/' + 'best_result.yaml', result)
        print('Evaluation report')
        print(
            metrics.classification_report(
                labels, result, target_names=eval_data_generator.get_labels()))
        print('Confusion matrix')
        cm = metrics.confusion_matrix(labels, result)
        print(cm)
Example #7
def train(args):
    tf.set_random_seed(19)
    tf_config = tf.ConfigProto(allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        print('Loading data')
        dataset_info = yaml_utils.read(args['dataset']['path'])
        dictionary = yaml_utils.read(dataset_info['dictionary_path'])
        print('Dictionary loaded')
        if 'embedding_path' in dataset_info.keys():
            embedding = np.array(yaml_utils.read(
                dataset_info['embedding_path']),
                                 dtype=np.float32)
        else:
            embedding = None
        # embedding = None  # test without embedding
        print('Word embeddings loaded')
        train_dataset = yaml_utils.read(dataset_info['train_path'])
        print('Training data loaded')
        eval_dataset = yaml_utils.read(dataset_info['eval_path'])
        print('Validation data loaded')
        print('Loading complete')
        data_loader = get_data_loader_by_name(
            args['dataset']['data_generator'])
        train_data_generator = data_loader(
            dictionary,
            True,
            train_dataset,
            batch_size=args['batch_size'],
            seq_length=args['dataset']['seq_length'])

        eval_data_generator = data_loader(
            dictionary,
            False,
            eval_dataset,
            batch_size=args['batch_size'],
            seq_length=args['dataset']['seq_length'])
        model_class = get_model_class_by_name(args['model']['name'])
        model = model_class(sess=sess,
                            train_generator=train_data_generator,
                            eval_generator=eval_data_generator,
                            embedding=embedding,
                            **dataset_info,
                            **args['dataset'],
                            **args['model'],
                            **args)
        model.train()
Example #8
def generate_batch(data_path, batch_size, num_skips, skip_window, **args):
    data = yaml_utils.read(data_path)
    batch = np.ndarray(shape=batch_size, dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)
    buffer.extend(data[0:span])
    data_index = span  # init buffer and data_index
    while True:
        for i in range(batch_size // num_skips):
            context_words = [w for w in range(span) if w != skip_window]
            words_to_use = random.sample(context_words, num_skips)
            for j, context_word in enumerate(words_to_use):
                batch[i * num_skips + j] = buffer[skip_window]
                labels[i * num_skips + j, 0] = buffer[context_word]
            if data_index == len(data):
                buffer.extend(data[0:span])
                data_index = span
            else:
                buffer.append(data[data_index])
                data_index = (data_index + 1) % len(data)
        yield batch, labels
Example #9
import cv2
import numpy as np
from pathlib import Path
from PIL import Image

from utils import yaml_utils
from yolo3.model import YOLO3

if __name__ == '__main__':
    # dataset
    image_path = './dataset/yolo3/football/img2.jpg'
    data_shape = np.array((416, 416))  # multiple of 32, hw
    dataset_name = 'coco2017'
    anchors = np.array(yaml_utils.read('configs/yolo3/anchors.yaml'))
    classes = yaml_utils.read('dataset/yolo3/{}/classes.yaml'.format(dataset_name))
    num_layers = len(anchors) // 3  # Different detection scales   y1,y2,y3
    anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]

    # model
    yolo3 = YOLO3(data_shape, classes, num_layers, anchor_mask, anchors, './_checkpoints/yolov3/')
    yolo3.build_eval_model(dataset_name, score_threshold=0.3, iou_threshold=0.45)
    image = cv2.imread(image_path)
    image = Image.fromarray(image)
    objects, _ = yolo3.detect_image(image)
    image = np.array(image)
    for obj in objects:
        bounding_box = obj['bounding_box']
        cv2.rectangle(image, (bounding_box[0], bounding_box[1]), (bounding_box[2], bounding_box[3]), (0, 0, 255),
                      thickness=2)
    output_dir = Path('./_results') / dataset_name
    output_dir.mkdir(exist_ok=True, parents=True)
Example #10
    buffer = collections.deque(maxlen=span)
    buffer.extend(data[0:span])
    data_index = span  # init buffer and data_index
    while True:
        for i in range(batch_size // num_skips):
            context_words = [w for w in range(span) if w != skip_window]
            words_to_use = random.sample(context_words, num_skips)
            for j, context_word in enumerate(words_to_use):
                batch[i * num_skips + j] = buffer[skip_window]
                labels[i * num_skips + j, 0] = buffer[context_word]
            if data_index == len(data):
                buffer.extend(data[0:span])
                data_index = span
            else:
                buffer.append(data[data_index])
                data_index = (data_index + 1) % len(data)
        yield batch, labels


if __name__ == '__main__':
    print('Reading data')
    dataset = yaml_utils.read('dataset/little_data.yaml')
    print('Data loaded')
    data = dataset['data']
    reverse_dictionary = dataset['reverse_dictionary']
    generator = generate_batch(data, 8, 2, 1)
    batch, labels = next(generator)
    for i in range(8):
        print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0],
              reverse_dictionary[labels[i, 0]])
Example #11
import cv2
import numpy as np
import xml.etree.ElementTree as ET
from pathlib import Path

from utils import yaml_utils

train_scale = 0.99
classes = {'ball': 0, 'messi': 1}
data_dir = '/home/yf/dataset/football/'  # E:/Dataset/football/
bounding_boxes = yaml_utils.read(data_dir + 'dataset.yaml')
output_dir = Path('../../dataset/yolo3/football/')

dataset = list()
for item in bounding_boxes:
    image_path = data_dir + 'images/' + item['name'] + '.jpg'
    image = cv2.imread(image_path)
    image_shape = item['shape']
    objects = list()
    for _obj in item['bounding_box']:
        bndbox = {'xmin': _obj['shape'][0], 'ymin': _obj['shape'][1], 'xmax': _obj['shape'][2],
                  'ymax': _obj['shape'][3]}
        cv2.rectangle(image, (bndbox['xmin'], bndbox['ymin']), (bndbox['xmax'], bndbox['ymax']), (0, 0, 255),
                      thickness=2)
        objects.append({'class_id': classes[_obj['name']], 'bndbox': bndbox})
    dataset.append({'image_path': image_path, 'size': image_shape, 'objects': objects})
    # cv2.imshow('image', image)  # show the image
    # cv2.waitKey(1)
output_dir.mkdir(parents=True, exist_ok=True)
np.random.shuffle(dataset)
train_steps = int(len(dataset) * train_scale)
Example #12
        # one-hot encode the label
        label_id = self.label_dictionary[source_label]
        label = np.zeros((len(self.label_dictionary)), dtype=np.float32)
        label[label_id] = 1.0
        return input, label

    def get_data_generator(self):
        batch_input = list()
        batch_label = list()
        while True:
            if self.is_augmented:
                np.random.shuffle(self.dataset_list)
            for item in self.dataset_list:
                transf_input, transf_label = self.transform_word(item['input'], item['label'])
                batch_input.append(transf_input)
                batch_label.append(transf_label)
                if len(batch_input) == self.batch_size:
                    yield np.array(batch_input), np.array(batch_label)
                    batch_input = list()
                    batch_label = list()


if __name__ == '__main__':
    dataset_info = yaml_utils.read('../dataset/aclImdb/info.yaml')
    dictionary = yaml_utils.read(dataset_info['dictionary_path'])
    train_dataset = yaml_utils.read(dataset_info['eval_path'])
    print('Reading complete')
    data_generator = MultipleFileDataGenerator(dictionary, True, train_dataset, 32, 600)
    batch_input, batch_label = next(data_generator.get_data_generator())
    print(11)
Example #13
            batch_images.append(image)
            for l in range(num_layers):
                batch_labels[l].append(labels[l])
            if len(batch_images) == batch_size:
                batch_labels = [
                    np.array(batch_labels[l]) for l in range(num_layers)
                ]
                yield [np.array(batch_images),
                       *batch_labels], np.zeros(batch_size)
                batch_images = []
                batch_labels = [list() for _ in range(num_layers)]


if __name__ == '__main__':
    dataset_name = 'voc2012'
    anchors = np.array(yaml_utils.read('../configs/yolo3/anchors.yaml'))
    classes = yaml_utils.read(
        '../dataset/yolo3/{}/classes.yaml'.format(dataset_name))
    train_dataset = yaml_utils.read(
        '../dataset/yolo3/{}/train_dataset.yaml'.format(dataset_name))
    eval_dataset = yaml_utils.read(
        '../dataset/yolo3/{}/eval_dataset.yaml'.format(dataset_name))
    data_shape = np.array((416, 416))  # multiple of 32, hw
    batch_size = 3

    # dataset
    train_generator = data_generator(train_dataset, batch_size, data_shape,
                                     len(classes), anchors, True)
    train_value = next(train_generator)
    eval_generator = data_generator(eval_dataset, batch_size, data_shape,
                                    len(classes), anchors, False)
Example #14
def get_config(path):
    _dict = yaml_utils.read(path)
    return _dict
Example #15
def get_config(name):
    base_config = yaml_utils.read('configs/chinese.yaml')
    config_ = dict_update(base_config,
                          yaml_utils.read('configs/' + name + '.yaml'))
    return config_
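
`dict_update` in Example #15 is a project helper that is not shown on this page; presumably it merges the model-specific YAML over the base `configs/chinese.yaml` settings. A minimal sketch under that assumption (recursive merge, values from the update win):

def dict_update(base, update):
    # Hypothetical recursive merge; the project's helper may behave differently.
    merged = dict(base)
    for key, value in (update or {}).items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = dict_update(merged[key], value)
        else:
            merged[key] = value
    return merged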
Example #16
def get_config(name):
    config_ = yaml_utils.read('configs/' + name + '.yaml')
    return config_
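
All of the examples above funnel through `yaml_utils.read` (and Example #6 also references a commented-out `yaml_utils.write`). The module itself is not included on this page; a minimal sketch, assuming it is a thin wrapper around PyYAML:

import yaml


def read(path):
    # Parse a YAML file into plain Python objects (dict/list/str/...).
    with open(str(path), 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)


def write(path, data):
    # Serialize `data` to YAML; allow_unicode keeps Chinese text readable.
    with open(str(path), 'w', encoding='utf-8') as f:
        yaml.safe_dump(data, f, allow_unicode=True)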
Example #17
        init_op = tf.global_variables_initializer()
        sess.run(init_op)
        sim = sess.run(similarity,
                       feed_dict={
                           valid_dataset: words,
                           normalized_embeddings: final_embeddings
                       })
        for i in range(len(words)):
            valid_word = reverse_dictionary[words[i]]
            top_k = 5
            nearest = (-sim[i, :]).argsort()[1:top_k + 1]
            log_str = 'Nearest to ' + valid_word + ': '
            for k in range(top_k):
                close_word = reverse_dictionary[nearest[k]]
                log_str += str(k + 1) + ':' + close_word + ' '
            print(log_str)


if __name__ == '__main__':
    dataset_name = 'text8'
    model_name = 'SkipGram'
    tag = 'base'
    dataset_info = yaml_utils.read('dataset/' + dataset_name + '/info.yaml')
    checkpoint_dir = './_checkpoint/' + dataset_name + '/' + model_name + '/' + tag + '/'
    # visualization_result(checkpoint_dir + 'final_embeddings.yaml', dataset_info['reverse_dictionary'],
    #                      checkpoint_dir + 'tsne.png', 100)

    most_similar(checkpoint_dir + 'final_embeddings.yaml',
                 dataset_info['reverse_dictionary'], [1, 8, 34, 96, 144])
    # 62, 77, 95, 155, 187 wiki_corpus