def process_folder(all_png, out_dir):
    """Pack validation PNGs into a single pickled {'data', 'labels'} dict.

    Each readable image is flattened channel-wise (all R values, then all G,
    then all B) into one uint8 row; rows are stacked into 'data'.  The result
    is written to ``<out_dir>/val_data`` and a per-class histogram is printed.

    :param all_png: iterable of image file paths; the ground-truth label is
        looked up by basename (extension stripped) in the val ground dict.
    :param out_dir: output directory (created if missing).
    """
    label_dict = utils.get_label_dict()
    folders = utils.get_ordered_folders()
    val_ground_dict = utils.get_val_ground_dict()

    # Set membership is O(1); the original list made the per-image test O(n).
    labels_searched = {label_dict[folder] for folder in folders}

    labels_list = []
    images = []
    for image_index, image_name in enumerate(all_png):
        if image_index % 1000 == 0:
            Tools.print("{} {}".format(image_index, len(all_png)))

        basename = os.path.basename(image_name)
        # basename[:-4] strips the ".png" extension to form the lookup key.
        label = val_ground_dict[basename[:-4]]
        if label not in labels_searched:
            continue
        try:
            img = imageio.imread(image_name)
            r = img[:, :, 0].flatten()
            g = img[:, :, 1].flatten()
            b = img[:, :, 2].flatten()
        except Exception:  # was a bare except: it also swallowed KeyboardInterrupt
            Tools.print('Cant process image {}'.format(basename))
            with open("log_img2np_val.txt", "a") as f:
                # Newline added so log entries don't run together
                # (matches the train-side logger's format).
                f.write("Couldn't read: {}\n".format(image_name))
            continue
        # Concatenate the channel arrays directly; round-tripping through
        # Python lists (the original) was needlessly slow.
        images.append(np.concatenate((r, g, b)).astype(np.uint8))
        labels_list.append(label)

    # np.vstack replaces np.row_stack, which was removed in NumPy 2.0.
    data_val = np.vstack(images)

    # Can add some kind of data splitting
    d_val = {'data': data_val, 'labels': labels_list}
    Tools.new_dir(out_dir)
    # Context manager closes the file; the original leaked the handle.
    with open(os.path.join(out_dir, 'val_data'), 'wb') as f:
        pickle.dump(d_val, f)

    # Print a histogram over the 1000 classes (labels are 1-based).
    y_test = d_val['labels']
    count = np.zeros([1000])
    for label in y_test:
        count[label - 1] += 1
    for i in range(1000):
        Tools.print('%d : %d' % (i, count[i]))
    Tools.print('SUM: %d' % len(y_test))
    train_df.to_csv(args.data_path + '/train.csv', index=False)
    val_df.to_csv(args.data_path + '/val.csv', index=False)
    test_df.to_csv(args.data_path + '/test.csv', index=False)
    # Load vocab and label from file, if not exist create from data.
    if os.path.exists(args.data_path + '/' + args.vocab_name) is True:
        seq_vocab = TokenDictionary.load(args.data_path + '/' +
                                         args.vocab_name)
    else:
        seq_vocab = build_vocab(train_df['text'].tolist())
        seq_vocab.save(args.data_path + '/' + args.vocab_name, delimiter='\t')

    if os.path.exists(args.data_path + '/' + args.intent_name) is True:
        label_dict = Dictionary.load(args.data_path + '/' + args.intent_name)
    else:
        label_dict = get_label_dict(
            pd.concat([train_df, test_df, val_df])['label'].tolist())
        label_dict.save(args.data_path + '/' + args.intent_name,
                        delimiter='\t')

    # Create train, val, test dataset.
    train_dataset = LabelDataset(train_df,
                                 seq_vocab,
                                 label_dict,
                                 multi_label=True,
                                 max_seq_len=args.max_seq_len)
    max_seq_len = train_dataset.max_len_seq

    val_dataset = LabelDataset(val_df,
                               seq_vocab,
                               label_dict,
                               multi_label=True,
# Example #3 (snippet separator from the original paste; marker "예제 #3", vote count "0")
import architecture
from architecture import VAE, VAEClassifier
from base64 import b64decode
from flask import Flask, jsonify, request
import io
from PIL import Image
import torch
import torchvision.transforms as T
import utils

# Flask application serving predictions from a pre-trained VAE classifier.
app = Flask(__name__)

# Labels
# Class labels for the classifier head; also fixes the output dimension below.
LABELS = utils.get_label_dict()

# Model
# Inference is CPU-only; map_location keeps GPU-trained weights loadable here.
device = torch.device('cpu')
# 1024 is presumably the VAE latent size — TODO confirm against architecture.VAE.
vae = VAE(1024)
vae.load_state_dict(torch.load("./models/vae_epoch50.pth",
                               map_location=device))
# Classifier wraps the (pre-trained) VAE and predicts one of len(LABELS) classes.
model = VAEClassifier(vae, len(LABELS))
model.load_state_dict(
    torch.load("./models/classifier_epoch25.pth", map_location=device))
# Switch to eval mode: disables dropout / uses running batch-norm stats.
model.eval()


# Preprocess image
def preprocess(img_bytes):
    image = Image.open(io.BytesIO(img_bytes))
    img_transform = T.Compose([
        T.Resize(256),
def _capped_lens(sentences, max_len=50):
    """Whitespace-token count of each sentence, capped at max_len."""
    return [min(len(sent.split(' ')), max_len) for sent in sentences]


def get_loaders(word_emb_file,
                train_file,
                dev_file,
                test_file,
                num_ents,
                clipping=None,
                no_rel=None):
    """ Prepares data and returns Pytorch data_loaders.
    It takes three filenames as arguments as well as the file with the pre-trained word embeddings.
    The script assumes that all relation types occur at least once in the training data.

    Data is first split in tokens and types of the entities.
    Data is truncated to max_len and padded if necessary.
    Labels are represented as categorical single numbers.

    :param word_emb_file: file with pre-trained word embeddings.
    :param train_file, dev_file, test_file: data files for the three splits.
    :param num_ents: size of the entity vocabulary (top-n entities kept).
    :param clipping: if not None, clip the training set to an equal number of
        sentences per top-n entity.
    :param no_rel: if not None, filter "no relation" examples from all splits.
    :return: (train_loader, valid_loader, test_loader, word_input_dim,
        word_output_dim, word_embedding_matrix, idx2labels, mask_matrix)
    """
    batch_size = 32

    train_tokens, train_types, train_labels = utils.middletokens_types_labels(
        train_file)
    dev_tokens, dev_types, dev_labels = utils.middletokens_types_labels(
        dev_file)
    test_tokens, test_types, test_labels = utils.middletokens_types_labels(
        test_file)

    train_ents, train_dict = utils.get_entity_set(train_file)
    dev_ents, _ = utils.get_entity_set(dev_file)
    test_ents, _ = utils.get_entity_set(test_file)

    # Entity vocabulary built from training data only; 0 is the OOV index.
    ent2idx = utils.get_ent_voc(train_dict, num_ents)

    train_ents_idx = [ent2idx.get(entity, 0) for entity in train_ents]
    dev_ents_idx = [ent2idx.get(entity, 0) for entity in dev_ents]
    test_ents_idx = [ent2idx.get(entity, 0) for entity in test_ents]

    # PEP 8: compare against None with `is not`, not `!=`.
    if no_rel is not None:
        train_tokens, train_types, train_ents_idx, train_labels = no_rel_filter(
            train_tokens, train_types, train_ents_idx, train_labels)
        dev_tokens, dev_types, dev_ents_idx, dev_labels = no_rel_filter(
            dev_tokens, dev_types, dev_ents_idx, dev_labels)
        test_tokens, test_types, test_ents_idx, test_labels = no_rel_filter(
            test_tokens, test_types, test_ents_idx, test_labels)

    ###################################################################################################################
    ################## clipped data set with only top n entities, equal number of sentences of each entity
    if clipping is not None:
        # Frequency of the least frequent top-n entity: every kept entity is
        # clipped down to this many sentences so classes are balanced.
        cut_off = len([
            x for x in train_ents if str(x) == list(ent2idx.keys())[list(
                ent2idx.values()).index(num_ents - 1)]
        ])  #freq of last topn item
        train_tokens, train_types, train_labels, train_ents_idx = create_topn_set(
            train_tokens, train_types, train_labels, train_ents_idx, cut_off)

    # Sentence lengths (capped at 50 = maxlen) are needed by the loaders
    # for masking/packing.
    train_lens = _capped_lens(train_tokens)
    dev_lens = _capped_lens(dev_tokens)
    test_lens = _capped_lens(test_tokens)

    # Label vocabulary is built over all splits so dev/test labels never miss.
    labels2idx = utils.get_label_dict(train_labels + dev_labels + test_labels)
    idx2labels = {value: key for key, value in labels2idx.items()}

    train_labels = [labels2idx[label] for label in train_labels]
    dev_labels = [labels2idx[label] for label in dev_labels]
    test_labels = [labels2idx[label] for label in test_labels]

    # Convert text and labels to matrix format.
    word_to_id, word_embedding_matrix = utils.vocab_and_vectors(
        word_emb_file, ['<PAD>', '<UNK>'])

    num_types = 20
    type_to_id = get_words_to_id_map(train_types, num_types, "types")

    maxlen = 50

    train_word_matrix = get_text_matrix(train_tokens, word_to_id, maxlen)
    train_type_matrix = get_text_matrix(train_types, type_to_id, 2)

    dev_word_matrix = get_text_matrix(dev_tokens, word_to_id, maxlen)
    dev_type_matrix = get_text_matrix(dev_types, type_to_id, 2)

    test_word_matrix = get_text_matrix(test_tokens, word_to_id, maxlen)
    test_type_matrix = get_text_matrix(test_types, type_to_id, 2)

    mask_matrix = utils.get_mapping_ent2rel(
        train_ents_idx + dev_ents_idx + test_ents_idx,
        train_labels + dev_labels + test_labels, len(labels2idx), num_ents)

    word_input_dim = word_embedding_matrix.shape[0]
    word_output_dim = word_embedding_matrix.shape[1]

    train_loader = convert2tensor(train_word_matrix, train_type_matrix,
                                  train_labels, train_lens, train_ents_idx,
                                  batch_size)
    valid_loader = convert2tensor(dev_word_matrix, dev_type_matrix, dev_labels,
                                  dev_lens, dev_ents_idx, batch_size)
    test_loader = convert2tensor(test_word_matrix, test_type_matrix,
                                 test_labels, test_lens, test_ents_idx,
                                 batch_size)

    return train_loader, valid_loader, test_loader, word_input_dim, word_output_dim, word_embedding_matrix, idx2labels, mask_matrix
def process_folder(in_dir, out_dir):
    """Pack per-class training images into ten shuffled pickle batches.

    Each readable image under ``<in_dir>/<folder>`` is flattened channel-wise
    (R, then G, then B) into one uint8 row.  All rows are shuffled and split
    into ten files ``train_data_batch_1`` .. ``train_data_batch_10``, each a
    pickled dict with 'data', 'labels' and the global per-pixel 'mean'.
    A per-class histogram is printed at the end.

    :param in_dir: directory containing one sub-folder per class.
    :param out_dir: output directory (created if missing).
    """
    label_dict = utils.get_label_dict()
    folders = utils.get_ordered_folders()
    data_list_train = []
    labels_list_train = []
    for folder in folders:
        label = label_dict[folder]
        Tools.print("Processing images from folder %s as label %d" %
                    (folder, label))
        images = []
        try:
            for image_name in os.listdir(os.path.join(in_dir, folder)):
                try:
                    img = imageio.imread(
                        os.path.join(in_dir, folder, image_name))
                    r = img[:, :, 0].flatten()
                    g = img[:, :, 1].flatten()
                    b = img[:, :, 2].flatten()
                except Exception:
                    Tools.print('Cant process image %s' % image_name)
                    with open("log_img2np.txt", "a") as f:
                        f.write("Couldn't read: {} \n".format(
                            os.path.join(in_dir, folder, image_name)))
                    continue
                # Concatenate channel arrays directly instead of going
                # through Python lists (same result, far less work).
                images.append(np.concatenate((r, g, b)).astype(np.uint8))

            # Guard the empty case explicitly: np.vstack([]) raises, and the
            # original code let the outer handler swallow it silently.
            if not images:
                Tools.print('No readable images in folder %s' % folder)
                continue

            # np.vstack replaces np.row_stack, removed in NumPy 2.0.
            data = np.vstack(images)
            samples_num = data.shape[0]

            labels_list_train.extend([label] * samples_num)
            data_list_train.append(data)

            Tools.print('Label: %d: %s has %d samples' %
                        (label, folder, samples_num))
        except Exception as e:
            # Best-effort per folder, but say WHY it was skipped instead of
            # silently passing (the original hid e.g. missing directories).
            Tools.print('Skipping folder %s: %s' % (folder, e))

    x = np.concatenate(data_list_train, axis=0)
    y_arr = np.array(labels_list_train)  # hoisted: built once, not per batch
    x_mean = np.mean(x, axis=0)
    train_indices = np.arange(x.shape[0])
    np.random.shuffle(train_indices)
    curr_index = 0
    size = x.shape[0] // 10

    y_test = []
    Tools.new_dir(out_dir)
    # First nine batches get exactly `size` rows each.
    for batch_num in range(1, 10):
        batch_idx = train_indices[curr_index:curr_index + size]
        d = {
            'data': x[batch_idx, :],
            'labels': y_arr[batch_idx].tolist(),
            'mean': x_mean,
        }
        # Context manager closes the file; the original leaked the handle.
        with open(os.path.join(out_dir, 'train_data_batch_%d' % batch_num),
                  'wb') as f:
            pickle.dump(d, f)
        curr_index += size
        y_test.extend(d['labels'])

    # Tenth batch takes the remainder (size rows plus any rounding leftover).
    batch_idx = train_indices[curr_index:]
    d = {
        'data': x[batch_idx, :],
        'labels': y_arr[batch_idx].tolist(),
        'mean': x_mean,
    }
    with open(os.path.join(out_dir, 'train_data_batch_10'), 'wb') as f:
        pickle.dump(d, f)
    y_test.extend(d['labels'])

    # Print a histogram over the 1000 classes (labels are 1-based).
    count = np.zeros([1000])
    for label in y_test:
        count[label - 1] += 1
    for i in range(1000):
        Tools.print('%d : %d' % (i, count[i]))
    Tools.print('SUM: %d' % len(y_test))