def process_folder(all_png, out_dir):
    label_dict = utils.get_label_dict()
    folders = utils.get_ordered_folders()
    val_ground_dict = utils.get_val_ground_dict()

    # Only keep validation images whose label belongs to the ordered folders.
    labels_searched = []
    for folder in folders:
        labels_searched.append(label_dict[folder])

    labels_list = []
    images = []
    for image_index, image_name in enumerate(all_png):
        if image_index % 1000 == 0:
            Tools.print("{} {}".format(image_index, len(all_png)))

        basename = os.path.basename(image_name)
        label = val_ground_dict[basename[:-4]]
        if label not in labels_searched:
            continue

        try:
            img = imageio.imread(image_name)
            r = img[:, :, 0].flatten()
            g = img[:, :, 1].flatten()
            b = img[:, :, 2].flatten()
        except Exception:
            Tools.print("Can't process image {}".format(basename))
            with open("log_img2np_val.txt", "a") as f:
                f.write("Couldn't read: {}\n".format(image_name))
            continue

        # Store each image as its flattened R, G and B planes back to back.
        arr = np.array(list(r) + list(g) + list(b), dtype=np.uint8)
        images.append(arr)
        labels_list.append(label)

    data_val = np.row_stack(images)

    # Some kind of data splitting could be added here.
    d_val = {'data': data_val, 'labels': labels_list}

    Tools.new_dir(out_dir)
    with open(os.path.join(out_dir, 'val_data'), 'wb') as f:
        pickle.dump(d_val, f)

    # Count samples per label (labels are 1-based, hence the -1 offset).
    y_test = d_val['labels']
    count = np.zeros([1000])
    for i in y_test:
        count[i - 1] += 1
    for i in range(1000):
        Tools.print('%d : %d' % (i, count[i]))
    Tools.print('SUM: %d' % len(y_test))
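# Minimal usage sketch for the validation converter above. The glob pattern and
# output directory are hypothetical examples, not paths from this project; it
# assumes `process_folder` and its `utils`/`Tools` helpers are importable here.
import glob

if __name__ == '__main__':
    all_png = sorted(glob.glob('/path/to/imagenet/val/*.png'))
    process_folder(all_png, out_dir='/path/to/output/val_np')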
train_df.to_csv(args.data_path + '/train.csv', index=False)
val_df.to_csv(args.data_path + '/val.csv', index=False)
test_df.to_csv(args.data_path + '/test.csv', index=False)

# Load the vocab and label dictionaries from file; if they do not exist, create them from the data.
if os.path.exists(args.data_path + '/' + args.vocab_name):
    seq_vocab = TokenDictionary.load(args.data_path + '/' + args.vocab_name)
else:
    seq_vocab = build_vocab(train_df['text'].tolist())
    seq_vocab.save(args.data_path + '/' + args.vocab_name, delimiter='\t')

if os.path.exists(args.data_path + '/' + args.intent_name):
    label_dict = Dictionary.load(args.data_path + '/' + args.intent_name)
else:
    label_dict = get_label_dict(
        pd.concat([train_df, test_df, val_df])['label'].tolist())
    label_dict.save(args.data_path + '/' + args.intent_name, delimiter='\t')

# Create the train, val and test datasets.
train_dataset = LabelDataset(train_df, seq_vocab, label_dict,
                             multi_label=True, max_seq_len=args.max_seq_len)
max_seq_len = train_dataset.max_len_seq
val_dataset = LabelDataset(val_df, seq_vocab, label_dict, multi_label=True,
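# A small sketch of the DataFrame layout the snippet above expects: a 'text'
# column with the input sequence and a 'label' column with the intent. The
# example rows below are made up for illustration only.
import pandas as pd

example_df = pd.DataFrame({
    'text': ['book a flight to hanoi', 'play some jazz music'],
    'label': ['book_flight', 'play_music'],
})
# build_vocab(example_df['text'].tolist()) and
# get_label_dict(example_df['label'].tolist()) would be built from these columns.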
import architecture
from architecture import VAE, VAEClassifier
from base64 import b64decode
from flask import Flask, jsonify, request
import io
from PIL import Image
import torch
import torchvision.transforms as T
import utils

app = Flask(__name__)

# Labels
LABELS = utils.get_label_dict()

# Model
device = torch.device('cpu')
vae = VAE(1024)
vae.load_state_dict(torch.load("./models/vae_epoch50.pth", map_location=device))
model = VAEClassifier(vae, len(LABELS))
model.load_state_dict(
    torch.load("./models/classifier_epoch25.pth", map_location=device))
model.eval()


# Preprocess image
def preprocess(img_bytes):
    image = Image.open(io.BytesIO(img_bytes))
    img_transform = T.Compose([
        T.Resize(256),
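# Hypothetical client for the Flask service above. The '/predict' route and the
# 'image' payload field are assumptions for illustration; the actual route is
# not shown in this excerpt, and the transform pipeline above is truncated.
import base64
import requests

with open('example.jpg', 'rb') as f:
    payload = {'image': base64.b64encode(f.read()).decode('utf-8')}
response = requests.post('http://localhost:5000/predict', json=payload)
print(response.json())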
def get_loaders(word_emb_file, train_file, dev_file, test_file, num_ents,
                clipping=None, no_rel=None):
    """
    Prepares the data and returns PyTorch data loaders.

    It takes three data-set filenames as arguments as well as the file with the
    pre-trained word embeddings. The script assumes that all relation types
    occur at least once in the training data. The data is first split into
    tokens and entity types, truncated to max_len and padded if necessary.
    Labels are represented as single categorical numbers.
    """
    batch_size = 32

    train_tokens, train_types, train_labels = utils.middletokens_types_labels(
        train_file)
    dev_tokens, dev_types, dev_labels = utils.middletokens_types_labels(
        dev_file)
    test_tokens, test_types, test_labels = utils.middletokens_types_labels(
        test_file)

    train_ents, train_dict = utils.get_entity_set(train_file)
    dev_ents, _ = utils.get_entity_set(dev_file)
    test_ents, _ = utils.get_entity_set(test_file)

    ent2idx = utils.get_ent_voc(train_dict, num_ents)
    train_ents_idx = [ent2idx.get(entity, 0) for entity in train_ents]
    dev_ents_idx = [ent2idx.get(entity, 0) for entity in dev_ents]
    test_ents_idx = [ent2idx.get(entity, 0) for entity in test_ents]

    if no_rel is not None:
        train_tokens, train_types, train_ents_idx, train_labels = no_rel_filter(
            train_tokens, train_types, train_ents_idx, train_labels)
        dev_tokens, dev_types, dev_ents_idx, dev_labels = no_rel_filter(
            dev_tokens, dev_types, dev_ents_idx, dev_labels)
        test_tokens, test_types, test_ents_idx, test_labels = no_rel_filter(
            test_tokens, test_types, test_ents_idx, test_labels)

    # Clipped data set with only the top-n entities and an equal number of
    # sentences for each entity.
    if clipping is not None:
        # Frequency of the least frequent entity among the top n.
        cut_off = len([
            x for x in train_ents
            if str(x) == list(ent2idx.keys())[list(ent2idx.values()).index(num_ents - 1)]
        ])
        train_tokens, train_types, train_labels, train_ents_idx = create_topn_set(
            train_tokens, train_types, train_labels, train_ents_idx, cut_off)

    # Sentence lengths, capped at 50 tokens.
    train_lens = [min(len(sent.split(' ')), 50) for sent in train_tokens]
    dev_lens = [min(len(sent.split(' ')), 50) for sent in dev_tokens]
    test_lens = [min(len(sent.split(' ')), 50) for sent in test_tokens]

    labels2idx = utils.get_label_dict(train_labels + dev_labels + test_labels)
    idx2labels = dict([(value, key) for key, value in labels2idx.items()])
    train_labels = [labels2idx[label] for label in train_labels]
    dev_labels = [labels2idx[label] for label in dev_labels]
    test_labels = [labels2idx[label] for label in test_labels]

    # Convert text and labels to matrix format.
    word_to_id, word_embedding_matrix = utils.vocab_and_vectors(
        word_emb_file, ['<PAD>', '<UNK>'])
    num_types = 20
    type_to_id = get_words_to_id_map(train_types, num_types, "types")

    maxlen = 50
    train_word_matrix = get_text_matrix(train_tokens, word_to_id, maxlen)
    train_type_matrix = get_text_matrix(train_types, type_to_id, 2)
    dev_word_matrix = get_text_matrix(dev_tokens, word_to_id, maxlen)
    dev_type_matrix = get_text_matrix(dev_types, type_to_id, 2)
    test_word_matrix = get_text_matrix(test_tokens, word_to_id, maxlen)
    test_type_matrix = get_text_matrix(test_types, type_to_id, 2)

    mask_matrix = utils.get_mapping_ent2rel(
        train_ents_idx + dev_ents_idx + test_ents_idx,
        train_labels + dev_labels + test_labels, len(labels2idx), num_ents)

    word_input_dim = word_embedding_matrix.shape[0]
    word_output_dim = word_embedding_matrix.shape[1]

    train_loader = convert2tensor(train_word_matrix, train_type_matrix,
                                  train_labels, train_lens, train_ents_idx,
                                  batch_size)
    valid_loader = convert2tensor(dev_word_matrix, dev_type_matrix, dev_labels,
                                  dev_lens, dev_ents_idx, batch_size)
    test_loader = convert2tensor(test_word_matrix, test_type_matrix,
                                 test_labels, test_lens, test_ents_idx,
                                 batch_size)

    return (train_loader, valid_loader, test_loader, word_input_dim,
            word_output_dim, word_embedding_matrix, idx2labels, mask_matrix)
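# Usage sketch for get_loaders. The file paths and num_ents value are
# placeholders; the exact contents of each batch depend on convert2tensor,
# which is not shown in this excerpt.
(train_loader, valid_loader, test_loader, word_input_dim, word_output_dim,
 word_embedding_matrix, idx2labels, mask_matrix) = get_loaders(
    'embeddings.txt', 'train.tsv', 'dev.tsv', 'test.tsv', num_ents=100)

for batch in train_loader:
    # Assuming convert2tensor yields batches of (words, types, labels,
    # lengths, entity ids); inspect one batch and stop.
    print(type(batch), len(batch))
    break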
def process_folder(in_dir, out_dir):
    label_dict = utils.get_label_dict()
    folders = utils.get_ordered_folders()

    data_list_train = []
    labels_list_train = []
    for folder in folders:
        label = label_dict[folder]
        Tools.print("Processing images from folder %s as label %d" % (folder, label))

        images = []
        try:
            for image_name in os.listdir(os.path.join(in_dir, folder)):
                try:
                    img = imageio.imread(os.path.join(in_dir, folder, image_name))
                    r = img[:, :, 0].flatten()
                    g = img[:, :, 1].flatten()
                    b = img[:, :, 2].flatten()
                except Exception:
                    Tools.print("Can't process image %s" % image_name)
                    with open("log_img2np.txt", "a") as f:
                        f.write("Couldn't read: {}\n".format(
                            os.path.join(in_dir, folder, image_name)))
                    continue

                # Store each image as its flattened R, G and B planes back to back.
                arr = np.array(list(r) + list(g) + list(b), dtype=np.uint8)
                images.append(arr)

            data = np.row_stack(images)
            samples_num = data.shape[0]
            labels = [label] * samples_num
            labels_list_train.extend(labels)
            data_list_train.append(data)
            Tools.print('Label: %d: %s has %d samples' % (label, folder, samples_num))
        except Exception:
            pass

    x = np.concatenate(data_list_train, axis=0)
    y = labels_list_train
    x_mean = np.mean(x, axis=0)

    train_indices = np.arange(x.shape[0])
    np.random.shuffle(train_indices)

    # Split the shuffled data into 10 roughly equal training batches.
    curr_index = 0
    size = x.shape[0] // 10
    y_test = []
    Tools.new_dir(out_dir)
    for i in range(1, 10):
        d = {
            'data': x[train_indices[curr_index:(curr_index + size)], :],
            'labels': np.array(y)[train_indices[curr_index:(curr_index + size)]].tolist(),
            'mean': x_mean
        }
        with open(os.path.join(out_dir, 'train_data_batch_%d' % i), 'wb') as f:
            pickle.dump(d, f)
        curr_index += size
        y_test.extend(d['labels'])

    # The last batch takes all remaining samples.
    d = {
        'data': x[train_indices[curr_index:], :],
        'labels': np.array(y)[train_indices[curr_index:]].tolist(),
        'mean': x_mean
    }
    with open(os.path.join(out_dir, 'train_data_batch_10'), 'wb') as f:
        pickle.dump(d, f)
    y_test.extend(d['labels'])

    # Per-label sample counts.
    count = np.zeros([1000])
    for i in y_test:
        count[i - 1] += 1
    for i in range(1000):
        Tools.print('%d : %d' % (i, count[i]))
    Tools.print('SUM: %d' % len(y_test))
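# Sketch for reading one of the pickled batches back and recovering an image.
# Each row stores the R, G and B planes flattened back to back (see `arr`
# above); the output path is a placeholder and the side length is derived
# from the row width rather than assumed.
import os
import pickle

import numpy as np

with open(os.path.join('/path/to/output', 'train_data_batch_1'), 'rb') as f:
    batch = pickle.load(f)

row = batch['data'][0]
side = int(np.sqrt(row.size // 3))
img = row.reshape(3, side, side).transpose(1, 2, 0)  # HWC uint8 image
print(img.shape, batch['labels'][0])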