def __init__(self, data_config, transform=False, mode="train", visualize=False):
    """Build a detection dataset around the file list configured for *mode*.

    Args:
        data_config: dict of data-configuration values (paths, classes, sizes).
        transform: when True, augmentations are applied to samples.
        mode: "train" or "val"; selects which file list to read.
        visualize: when True, samples are returned in visualization form.
    """
    self.mode = mode
    self.visualize = visualize
    self.transform = transform

    # Paths and class names come straight from the configuration dict.
    self.image_directory = data_config["image_directory"]
    self.annotation_directory = data_config["annotation_directory"]
    self.classes = tuple(data_config['classes'])
    self.file_list = self.create_file_list(data_config[mode])

    # Augmentation pipeline starts as a no-op until one is configured.
    self.transformations = iaa.Noop()

    # Images are resized to a square of `figure_size` pixels.
    side = data_config["figure_size"]
    self.height = side
    self.width = side
    self.resize_transformation = iaa.Resize({"height": side, "width": side})

    self.data_encoder = DataEncoder(data_config)
    self.default_boxes = self.data_encoder.default_boxes
def __init__(self, threshold, num_groups, num_clusters, query_vector,
             server_url, index_name, dist_function_name, model_name=None):
    """Store search parameters and build the encoder for the query vector.

    Args:
        threshold: maximum distance for a hit to survive re-ranking.
        num_groups: number of token groups used by the encoder.
        num_clusters: number of clusters per group.
        query_vector: 1-D embedding of the query image.
        server_url: Elasticsearch endpoint.
        index_name: name of the ES index to query.
        dist_function_name: 'euclidean' or 'cosine'.
        model_name: when None the plain results folder is used, otherwise
            the VGG variant.
    """
    self.threshold = threshold
    self.es = Elasticsearch(server_url)
    self.index_name = index_name
    self.num_groups = num_groups
    self.num_clusters = num_clusters
    self.query_vector = query_vector
    self.dist_function_name = dist_function_name
    self.model_name = model_name
    # FIX: compare with `is None` (identity), not `== None` (PEP 8).
    # The two branches only differed in the results-folder name, so the
    # duplicated DataEncoder construction is collapsed into one call.
    results_folder = ('encode_results' if self.model_name is None
                      else 'encode_results_vgg')
    self.data_encoder = DataEncoder(
        num_groups, num_clusters, 1000,
        query_vector.reshape(1, query_vector.shape[0]),
        results_folder)
def __init__(self, data_location, output_folder, threads, language,
             split_size, logger):
    """Record pipeline settings and build the protein data encoder."""
    self.data_location = data_location
    self.output_folder = output_folder
    self.threads = threads
    self.language = language
    self.split_size = split_size
    self.logger = logger
    # The encoder shares this object's locations and threading settings.
    self.data_encoder = DataEncoder(data_location, output_folder,
                                    threads, language)
def setUp(self):
    """Load the test configuration and precompute the expected box total."""
    with open("tests/test_config.yaml") as config_file:
        full_config = yaml.safe_load(config_file)
    self.data_config = full_config["data_configuration"]
    self.data_encoder = DataEncoder(self.data_config)

    # Each feature-map cell contributes 2 base boxes plus 2 per configured
    # extra aspect ratio; sum over every cell of every feature map.
    self.total_num_boxes = 0
    for idx, fmap in enumerate(self.data_config['feature_maps']):
        boxes_per_cell = 2 + 2 * len(self.data_config['aspect_ratios'][idx])
        self.total_num_boxes += fmap * fmap * boxes_per_cell
def prepare_data(self):
    """Run the full preparation pipeline over every extracted data folder.

    For each folder: load its texts into LMDB, encode them to protein
    strings, emit a FASTA file, then clean intermediate encodings.
    After all folders are processed the counts are saved and the BLAST
    database is built.
    """
    # NOTE(review): loop extent reconstructed from whitespace-mangled
    # source — the per-folder steps through the logger call are assumed to
    # be inside the loop, the summary steps after it; confirm.
    self.initial_setup(self.output_folder)
    data_folders = self.extract_data_folders()
    for data_folder in data_folders:
        data_folder_path, data_folder_name = data_folder
        self.data_to_lmdb(data_folder_name, data_folder_path)
        # A fresh encoder per folder so each run targets that folder.
        data_encoder = DataEncoder(data_folder_path, self.output_folder,
                                   self.threads, self.language,
                                   data_folder_name)
        data_encoder.encode_data()
        self.make_fasta_file(data_folder_name)
        self.clean_encoded()
        # text_counts[-1][1] is presumably the count just added for this
        # folder — verify against clean_encoded/make_fasta_file.
        self.logger.info("Added DB: {} \t {} new texts".format(
            data_folder_name, self.text_counts[-1][1]))
    self.save_text_counts()
    self.make_db()
def __init__(self, data, output_folder, e_value, word_size, threads,
             text_count, logger, language="FIN"):
    """Keep BLAST query settings and create the matching data encoder.

    `e_value` and `word_size` are BLAST search parameters; `text_count`
    is the number of texts in the database.
    """
    self.data_location = data
    self.output_folder = output_folder
    self.e_value = e_value
    self.word_size = word_size
    self.threads = threads
    self.text_count = text_count
    self.logger = logger
    # Encoder mirrors this object's data location and worker settings.
    self.data_encoder = DataEncoder(data, output_folder, threads, language)
def main():
    """Draw one random group of default boxes on a resized validation image."""
    config = load_configurations()
    encoder_config = config['simple_model_configuration']
    data_encoder = DataEncoder(encoder_config)

    image_path = "data/val_images/000000086220.jpg"
    image = cv2.imread(image_path)
    aug = iaa.Resize({"height": 300, "width": 300})
    resized_image = aug(image=image)

    default_boxes = data_encoder.default_boxes
    box_groups = get_box_groups(encoder_config, default_boxes)
    # Pick one random box set from group 10 and scale it to pixel coords.
    set_of_boxes = random.choice(box_groups[10])
    set_of_boxes = (set_of_boxes * 300)
    print(set_of_boxes)

    vis_boxes = [
        BoundingBox(x1=corners[0], y1=corners[1],
                    x2=corners[2], y2=corners[3])
        for corners in set_of_boxes
    ]
    bbs = BoundingBoxesOnImage(vis_boxes, shape=resized_image.shape)
    image_with_box = bbs.draw_on_image(resized_image, size=2)
    cv2.imwrite("example.png", image_with_box)
def __init__(self):
    """Start with an empty encoder and no weight ids handed out."""
    # Counter for ids assigned to saved weight tensors.
    self._weight_id = 0
    self._data_encoder = DataEncoder()
class ModelSaver:
    """Serializes a Keras model into the binary format produced by DataEncoder.

    Each weight tensor is registered with an integer id; layers become
    dictionaries referencing those ids, and the whole description is
    written as a header entry followed by the encoded payload.
    """

    def __init__(self):
        # Monotonically increasing id handed out to each saved weight tensor.
        self._weight_id = 0
        self._data_encoder = DataEncoder()

    def _get_weight_id(self) -> int:
        """Return the next unused weight id (ids start at 1)."""
        self._weight_id += 1
        return self._weight_id

    def _save_weight(self, w: np.ndarray) -> int:
        """Register tensor *w* with the encoder and return its assigned id."""
        wid = self._get_weight_id()
        self._data_encoder.add_tensor_entry((wid, w))
        return wid

    def _save_activation_func(self, name: str, func: Callable) -> Dict:
        """Describe a standalone activation function as a layer dict.

        Raises:
            NotSupportedException: if *func* has no LayerType mapping.
        """
        layer_type = LayerType.get_for_activation_func(func)
        if layer_type is None:
            raise NotSupportedException(
                f'Activation function "{func.__name__}" is not supported.')
        return {
            'name': name,
            'type': layer_type.value,
        }

    def _save_activation_layer(self, layer: Activation) -> Dict:
        """Describe a Keras Activation layer as a layer dict.

        Raises:
            NotSupportedException: if the layer's activation has no
            LayerType mapping.
        """
        layer_type = LayerType.get_for_activation_func_name(layer.activation)
        if layer_type is None:
            raise NotSupportedException(
                f'Activation function "{layer.activation}" is not supported.')
        return {
            'name': layer.name,
            'type': layer_type.value,
        }

    def _save_softmax_layer(self, layer: Softmax) -> Dict:
        """Describe a Softmax layer; the default axis (-1) is stored as None."""
        return {
            'name': layer.name,
            'type': LayerType.SOFTMAX.value,
            'axis': layer.axis if layer.axis != -1 else None
        }

    def _save_dense(self, layer: Dense) -> List[Dict]:
        """Describe a Dense layer, plus its fused activation when non-linear.

        Returns a list of one or two layer dicts: the dense weights/bias
        entry, followed by a pseudo-layer for the activation if the layer
        has one other than linear.
        """
        weights: List[np.ndarray] = backend.batch_get_value(layer.weights)
        layer_dicts = [{
            'name': layer.name,
            'type': LayerType.DENSE.value,
            'w_shape': weights[0].shape,
            'w_id': self._save_weight(weights[0]),
            # Bias entries are None when the layer was built without bias.
            'b_shape': weights[1].shape if layer.use_bias else None,
            'b_id': self._save_weight(weights[1]) if layer.use_bias else None,
        }]
        if layer.activation is not None and layer.activation is not activations.linear:
            layer_dicts.append(
                self._save_activation_func(name=f'{layer.name}__activation',
                                           func=layer.activation))
        return layer_dicts

    def save_model(self, model: Model, file_path: str):
        """Encode *model* layer by layer and write the result to *file_path*.

        Only Dense, Activation and Softmax layers are serialized; any
        other layer type is silently skipped by the dispatch below.
        """
        layer_dicts = []
        layers: List[Layer] = model.layers
        for l in layers:
            lt = get_layer_type(l)
            if lt == Dense.__name__:
                layer_dicts.extend(self._save_dense(l))
            elif lt == Activation.__name__:
                layer_dicts.append(self._save_activation_layer(l))
            elif lt == Softmax.__name__:
                layer_dicts.append(self._save_softmax_layer(l))
        self._data_encoder.add_header_entry({
            'name': model.name,
            'layers': layer_dicts
        })
        with open(file_path, 'wb') as f:
            f.write(self._data_encoder.encode())
class DataPreparer:
    """Prepares gzipped-JSON text data for BLAST-based text-reuse analysis.

    Pipeline: create the output directory layout, load the original texts
    into LMDB databases, encode the texts to protein-like strings, dump
    them into a FASTA file and finally build a BLAST protein database.
    """

    def __init__(self, data_location, output_folder, threads, language,
                 split_size, logger):
        self.data_location = data_location
        self.output_folder = output_folder
        self.threads = threads
        self.language = language
        self.data_encoder = DataEncoder(data_location, output_folder,
                                        threads, language)
        self.split_size = split_size
        self.logger = logger

    def make_directory(self, where):
        """Create directory *where* if it does not already exist."""
        if not os.path.exists(where):
            os.makedirs(where)

    def prepare_data(self):
        """Run the full pipeline: setup, LMDB load, encoding, BLAST DB."""
        self.initial_setup(self.output_folder)
        self.data_to_lmdb()
        self.logger.info("Encoding data to proteins...")
        self.data_encoder.encode_data()
        self.generate_db()

    def split_text_into_blocks(self, block):
        """Yield (text_chunk, chunk_id) pairs for *block*.

        With a positive split_size the text is cut into fixed-size slices,
        each identified as "<doc_id>__<start>_<end>"; otherwise the whole
        text is yielded under the original doc_id.
        """
        if self.split_size > 0:
            for i in range(0, len(block["text"]), self.split_size):
                yield block["text"][i:i + self.split_size], str(
                    block["doc_id"]) + "__{}_{}".format(
                        i, i + self.split_size)
        else:
            yield block["text"], str(block["doc_id"])

    ## Generate a LMDB DB for the original data. Helps in reconstructing
    ## phase, this is done with just ONE thread :c
    def data_to_lmdb(self):
        """Load every non-empty text block into the text and info LMDB DBs.

        The text goes into the text DB keyed by (ASCII-only) doc id; all
        remaining metadata goes into the info DB under the same key.
        """
        self.logger.info("Loading original data into databases...")
        text_db, info_db = self.open_databases()
        files, folder = self.get_data_files()
        with text_db.begin(write=True) as t_db, info_db.begin(
                write=True) as i_db:
            for filename in files:
                with gzip.open(folder + "/" + filename, "rt") as data_file:
                    for line in data_file:
                        if not line:
                            continue
                        block = json.loads(line.strip())
                        # Skip documents with no text at all.
                        if len(block["text"]) == 0:
                            continue
                        for block_i, split_block in enumerate(
                                self.split_text_into_blocks(block)):
                            text_block, new_id = split_block
                            # Shallow copy so the original block survives
                            # the per-chunk mutation/deletion below.
                            block_data = {}
                            for k, v in block.items():
                                block_data[k] = v
                            block_data["text"] = text_block
                            ## Only ASCII in doc_id! BLAST doesn't like non
                            ## ascii characters in fsa
                            doc_id = new_id.encode("ascii", errors="ignore")
                            t_db.put(
                                doc_id,
                                block_data["text"].encode("unicode-escape"))
                            # Metadata record excludes text and doc_id.
                            del block_data["text"], block_data["doc_id"]
                            i_db.put(
                                doc_id,
                                json.dumps(block_data).encode(
                                    "unicode-escape"))  ##TODO BUT DOES IT WORK?

    def get_data_files(self):
        """Return ([filenames], folder) for data_location (dir or one file)."""
        if os.path.isdir(self.data_location):
            files = os.listdir(self.data_location)
            folder = self.data_location
        else:
            # NOTE(review): assumes a one-level "folder/file" relative path;
            # a deeper path (or one without "/") breaks this split — confirm
            # what callers pass in.
            files = [self.data_location.split("/")[1]]
            folder = self.data_location.split("/")[0]
        return files, folder

    def open_databases(self):
        """Open the (text, info) LMDB environments under the output folder."""
        text_env = lmdb.open(self.output_folder + "/db/original_data_DB",
                             map_size=50000000000)
        info_env = lmdb.open(self.output_folder + "/db/info_DB",
                             map_size=5000000000)
        return text_env, info_env

    ## Generate the protein database for BLAST
    def generate_db(self):
        """Produce the FASTA dump and build the BLAST DB from it."""
        self.logger.info("Generating protein database..")
        self.make_fasta_file()
        self.make_db()

    def make_fasta_file(self):
        """Write every encoded document into one FASTA file.

        Each record is headed by ">gi|<running-number> <doc_id>"; the
        total record count is kept in self.text_count.
        """
        encoded_db = lmdb.open(self.output_folder + "/db/encoded_data_DB",
                               readonly=True)
        gi = 1
        with encoded_db.begin() as db:
            with open(self.output_folder + "/db/database.fsa",
                      "w") as fasta_file:
                for key, value in db.cursor():
                    doc_id = key.decode("utf-8")
                    text = value.decode("unicode-escape")
                    begin = ">gi|{} {}".format(gi, doc_id)
                    fasta_file.write("{}\n{}\n".format(begin, text))
                    gi += 1
        self.text_count = gi - 1

    def get_text_count(self):
        """Number of records written by make_fasta_file."""
        return self.text_count

    ## Make the DB using makeblastdb
    def make_db(self):
        """Invoke makeblastdb on the FASTA dump; records the DB location."""
        subprocess.call(
            "makeblastdb -in {} -dbtype prot -title TextDB -parse_seqids -hash_index -out {}"
            .format(self.output_folder + "/db/database.fsa",
                    self.output_folder + "/db/textdb").split(" "))
        self.db_loc = self.output_folder + "/db/textdb"

    ## Make intial folders for later
    def initial_setup(self, where):
        """Create the output root and every working subdirectory."""
        self.logger.info("Performing initial setups...")
        self.make_directory(where)
        for location in [
                "encoded", "db", "subgraphs", "info", "batches", "clusters",
                "clusters/unfilled", "clusters/filled"
        ]:
            self.make_directory(where + "/" + location)
class ImageDataset(Dataset):
    """PyTorch dataset for SSD-style detection.

    Loads images (.jpg) and VOC-style XML annotations from configured
    directories, optionally augments them, resizes everything to a square
    of `figure_size` pixels and, in train mode, encodes the boxes against
    the default boxes via DataEncoder.
    """

    def __init__(self, data_config, transform=False, mode="train",
                 visualize=False):
        """Read paths/classes from *data_config* and build the encoder.

        Args:
            data_config: dict with the file lists, directories, classes
                and figure_size.
            transform: apply the augmentation pipeline in __getitem__.
            mode: "train" or "val"; selects the file list and whether
                labels are encoded.
            visualize: return samples before the HWC->CHW permute.
        """
        self.mode = mode
        self.visualize = visualize
        file_data_path = data_config[mode]
        self.image_directory = data_config["image_directory"]
        self.annotation_directory = data_config["annotation_directory"]
        self.classes = tuple(data_config['classes'])
        self.file_list = self.create_file_list(file_data_path)
        self.transform = transform
        # No-op until a real augmentation pipeline is assigned.
        self.transformations = iaa.Noop()
        self.height = data_config["figure_size"]
        self.width = data_config["figure_size"]
        self.resize_transformation = iaa.Resize({
            "height": self.height,
            "width": self.width
        })
        self.data_encoder = DataEncoder(data_config)
        self.default_boxes = self.data_encoder.default_boxes

    def __len__(self):
        """One sample per file in the file list."""
        return len(self.file_list)

    def __getitem__(self, idx):
        """Return (image, labels) for sample *idx*.

        In train mode labels are encoded against the default boxes; in
        visualize mode the image keeps its HWC layout.
        """
        filename = self.file_list[idx]
        image = self.load_image(filename)
        labels = self.load_labels_from_annotation(filename)
        labels = BoundingBoxesOnImage(labels, shape=image.shape)
        # Augment only when requested (training time).
        if self.transform:
            image, labels = self.transformations(
                image=image, bounding_boxes=labels
            )
        # Resize data regardless of train or test.
        image, labels = self.resize_transformation(
            image=image, bounding_boxes=labels
        )
        labels = labels.bounding_boxes
        labels = torch.Tensor([np.append(box.coords.flatten(), box.label)
                               for box in labels])
        image = torch.Tensor(image)
        # BUG FIX: was `self.mode is "train"` — identity comparison with a
        # string literal only works by CPython interning accident; use ==.
        if self.mode == "train":
            labels = self.data_encoder.encode(labels)
        if self.visualize:
            return (image, labels)
        # HWC -> CHW, the layout PyTorch conv layers expect.
        # This seems bad, but revisit when checking for bottlenecks.
        image = image.permute(2, 0, 1)
        return (image, labels)

    def create_file_list(self, file_data_path: str) -> list:
        """Read the file list CSV and return unique, cleaned filenames."""
        df = pd.read_csv(file_data_path, names=["filename"])
        # FIX: make regex matching explicit — pandas 2.0 changed the
        # str.replace default to regex=False, which would leave the
        # trailing " <digit>" markers in place.
        df['filename'] = df['filename'].str.replace(r"\s\s\d", "",
                                                    regex=True)
        file_list = df['filename'].unique().tolist()
        return file_list

    def load_image(self, filename: str) -> np.array:
        """Load <image_directory>/<filename>.jpg as a BGR numpy array."""
        path_to_file = os.path.join(self.image_directory, filename + ".jpg")
        image = cv2.imread(path_to_file)
        return image

    def load_labels_from_annotation(self, filename: str) -> list:
        """Parse the VOC XML annotation into a list of BoundingBox objects."""
        path_to_file = os.path.join(self.annotation_directory,
                                    filename + ".xml")
        with open(path_to_file) as fd:
            annotation = xmltodict.parse(fd.read())
        objects = annotation["annotation"]['object']
        # A single object is returned as a dict, not a one-element list.
        if not isinstance(objects, list):
            objects = [objects]
        labels = []
        for obj in objects:
            box = obj['bndbox']
            # NOTE(review): xmltodict yields string values; BoundingBox is
            # handed strings here — confirm downstream arithmetic copes
            # (consider int(box["xmin"]) etc. if not).
            labels.append(BoundingBox(
                x1=box["xmin"],
                y1=box["ymin"],
                x2=box["xmax"],
                y2=box["ymax"],
                label=self.classes.index(obj["name"])
            ))
        return labels
def main_1(var):
    """Evaluate one search configuration and return a minimizable loss.

    Args:
        var: sequence where var[0]=num_groups, var[1]=num_clusters,
            var[2] selects the distance function (>= 50 -> euclidean,
            else cosine) and var[3] is the re-ranking threshold.

    Returns:
        3 - precision - recall - F1-like term (lower is better), so an
        optimizer minimizing this maximizes precision/recall.
    """
    num_groups = int(var[0])
    num_clusters = int(var[1])
    # Continuous optimizer knob mapped onto a binary choice.
    dist_function_name = 'euclidean' if var[2] >= 50 else 'cosine'
    threshold = var[3]
    server_url = 'localhost:9200'
    num_queries = 200

    # FIX: removed the redundant f.close() — `with` already closes.
    with open('evaluation_set.json') as f:
        evaluation_set = json.load(f)
    training_embedding_vectors = np.load("PCA_2048_to_512_new.npy")
    query_vector_indices = random.sample(range(len(evaluation_set.keys())),
                                         num_queries)
    train_labels, image_names = get_image_data(
        'vn_celeb_face_recognition/train.csv')

    search_times = []
    mean_average_accuracy = 0
    mean_recall = 0
    for query_vector_index in query_vector_indices:
        query_vector = training_embedding_vectors[evaluation_set[str(
            query_vector_index)][0]]
        actual_query_label = train_labels[evaluation_set[str(
            query_vector_index)][0]]
        num_actual_results = len(evaluation_set[str(actual_query_label)])
        es = Elasticsearch(server_url)
        index_name = 'face_off_' + str(num_groups) + 'groups_' + str(
            num_clusters) + 'clusters_vgg'

        def timed_search():
            """Run one timed query; return (results, elapsed ms)."""
            start_time = datetime.now()
            searcher = Searcher(threshold, num_groups, num_clusters,
                                query_vector, server_url, index_name,
                                dist_function_name, 'vgg')
            results = searcher.search()
            search_time = datetime.now() - start_time
            search_time_in_ms = (search_time.days * 24 * 60 * 60 +
                                 search_time.seconds) * 1000 + \
                search_time.microseconds / 1000.0
            return results, search_time_in_ms

        # FIX: both branches previously duplicated the timed-search code;
        # only the one-off encode+index step differs when the index is new.
        if not es.indices.exists(index_name):
            # Index is missing: encode the corpus and index it first.
            data_encoder = DataEncoder(num_groups, num_clusters, 1000,
                                       training_embedding_vectors,
                                       'encode_results_vgg')
            data_encoder.run_encode_data()
            json_string_tokens_generator = JsonStringTokenGenerator(
                'encode_results_vgg', 'PCA_2048_to_512_new.npy',
                'vn_celeb_face_recognition/train.csv', num_groups,
                num_clusters)
            encoded_string_tokens_list = \
                json_string_tokens_generator.get_string_tokens_list()
            train_embs = json_string_tokens_generator.get_image_fetures()
            train_labels, image_names = \
                json_string_tokens_generator.get_image_metadata()
            json_string_tokens_list = \
                json_string_tokens_generator.generate_json_string_tokens_list(
                    encoded_string_tokens_list, train_labels, image_names,
                    train_embs)
            json_string_tokens_generator.save_json_string_tokens(
                json_string_tokens_list)
            print('saving completed....')
            print('******************************')
            indexer = ESIndexer('encode_results_vgg', num_groups,
                                num_clusters, server_url, 'vgg')
            indexer.index()
        results, search_time_in_ms = timed_search()
        # Queries with no hits are skipped entirely (as before, their
        # time is not recorded either).
        if len(results) == 0:
            continue
        search_times.append(search_time_in_ms)

        results_labels = [result['id'] for result in results]
        # Average precision over prefixes of the result list.
        accuracy_i = 0
        num_corrects = 0
        for i in range(len(results)):
            step_list = results_labels[:(i + 1)]
            num_corrects = len([
                j for j, x in enumerate(step_list) if x == actual_query_label
            ])
            accuracy_i += num_corrects / len(step_list)
        mean_average_accuracy += accuracy_i / len(results)
        # num_corrects from the final (full-length) prefix = total hits.
        recall_i = num_corrects / num_actual_results
        mean_recall += recall_i

    mean_average_accuracy = mean_average_accuracy / num_queries
    mean_recall = mean_recall / num_queries
    print(mean_average_accuracy, mean_recall)
    # BUG FIX: average_search_time was printed but never defined, which
    # raised NameError at runtime; compute it from the collected timings.
    average_search_time = (sum(search_times) / len(search_times)
                           if search_times else 0.0)
    print(average_search_time)
    # NOTE(review): divides by zero when both means are 0 (no hits at all)
    # — confirm whether the optimizer can reach that region.
    return 3 - mean_average_accuracy - mean_recall - (
        2 * mean_average_accuracy * mean_recall /
        (mean_average_accuracy + mean_recall))
class Searcher(object):
    """Two-stage face search against Elasticsearch.

    Stage one retrieves candidates by encoded string-token overlap via a
    function_score query; stage two re-ranks them by exact vector
    distance and keeps only hits within the threshold.
    """

    def __init__(self, threshold, num_groups, num_clusters, query_vector,
                 server_url, index_name, dist_function_name, model_name=None):
        """Store search parameters and build the query-vector encoder.

        Args:
            threshold: maximum distance for a hit to survive re-ranking.
            num_groups: number of token groups used by the encoder.
            num_clusters: number of clusters per group.
            query_vector: 1-D embedding of the query image.
            server_url: Elasticsearch endpoint.
            index_name: ES index to query.
            dist_function_name: 'euclidean' or 'cosine'.
            model_name: when None the plain results folder is used,
                otherwise the VGG variant.
        """
        self.threshold = threshold
        self.es = Elasticsearch(server_url)
        self.index_name = index_name
        self.num_groups = num_groups
        self.num_clusters = num_clusters
        self.query_vector = query_vector
        self.dist_function_name = dist_function_name
        self.model_name = model_name
        # FIX: compare with `is None` (identity), not `== None` (PEP 8);
        # the branches only differed in the results-folder name.
        results_folder = ('encode_results' if self.model_name is None
                          else 'encode_results_vgg')
        self.data_encoder = DataEncoder(
            num_groups, num_clusters, 1000,
            query_vector.reshape(1, query_vector.shape[0]),
            results_folder)

    def get_string_tokens(self):
        """Encode the query vector and return its string-token list."""
        kmeans, query_string_tokens_list = \
            self.data_encoder.encode_string_tokens()
        return query_string_tokens_list[0]

    def get_query_request_body(self, query_string_tokens):
        """Build a function_score body scoring by token matches.

        Each of the num_groups tokens contributes weight 1 when it
        matches; scores are summed and replace the base score.
        """
        string_tokens_body = list()
        for i in range(self.num_groups):
            sub_field = {
                "filter": {
                    "term": {
                        "image_encoded_tokens": query_string_tokens[i]
                    }
                },
                "weight": 1
            }
            string_tokens_body.append(sub_field)
        # RETRIEVE ONLY
        request_body = {
            "size": 30,
            "query": {
                "function_score": {
                    "functions": string_tokens_body,
                    "score_mode": "sum",
                    "boost_mode": "replace"
                }
            }
        }
        return request_body

    def get_result_from_es(self):
        """Run the token query and return the raw _source of each hit."""
        query_string_tokens = self.get_string_tokens()
        request_body = self.get_query_request_body(query_string_tokens)
        res = self.es.search(index=self.index_name, body=request_body)
        results_from_es = []
        for result in res['hits']['hits']:
            results_from_es.append(result['_source'])
        return results_from_es

    def re_rank(self):
        """Keep only ES candidates within threshold of the query vector."""
        results_from_es = self.get_result_from_es()
        final_results = []
        for result_from_es in results_from_es:
            result_actual_vector = np.asarray(
                result_from_es['image_actual_vector'])
            if self.dist_function_name == "euclidean":
                dist = np.linalg.norm(self.query_vector -
                                      result_actual_vector)
            if self.dist_function_name == 'cosine':
                dist = spatial.distance.cosine(self.query_vector,
                                               result_actual_vector)
            # NOTE(review): an unrecognized dist_function_name leaves
            # `dist` stale (or raises NameError on the first hit) —
            # confirm only 'euclidean'/'cosine' can reach here.
            if dist <= self.threshold:
                final_results.append(result_from_es)
        return final_results

    def search(self):
        """Public entry point: candidate retrieval plus re-ranking."""
        return self.re_rank()
class TestEncoder(unittest.TestCase):
    """Tests for DataEncoder's default boxes, IoU math and encoding."""

    def setUp(self):
        """Load the test config and precompute the expected box total."""
        with open("tests/test_config.yaml") as cfg_file:
            loaded = yaml.safe_load(cfg_file)
        self.data_config = loaded["data_configuration"]
        self.data_encoder = DataEncoder(self.data_config)
        # Every cell contributes 2 base boxes plus 2 per extra aspect ratio.
        self.total_num_boxes = 0
        for idx, fmap in enumerate(self.data_config['feature_maps']):
            per_cell = 2 + 2 * len(self.data_config['aspect_ratios'][idx])
            self.total_num_boxes += fmap * fmap * per_cell

    def test_default_box_shape(self):
        # The encoder must generate exactly the configured number of priors.
        self.assertEqual(self.total_num_boxes,
                         len(self.data_encoder.default_boxes))

    def test_single_iou_output_is_correct(self):
        first = torch.tensor([[200.0, 300.0, 350.0, 400.0]])
        second = torch.tensor([[250.0, 250.0, 400.0, 350.0]])
        ious = self.data_encoder.calculate_iou(first, second)
        # Element-wise comparison collapsed to a single pass/fail boolean.
        matches = torch.eq(ious, torch.tensor([[0.2]]))
        self.assertTrue(torch.all(matches).numpy())

    def test_multiple_iou_output_is_correct(self):
        first = torch.tensor([
            [200.0, 300.0, 350.0, 400.0],
            [250.0, 250.0, 400.0, 350.0],
        ])
        second = torch.tensor([
            [250.0, 250.0, 400.0, 350.0],
            [350.0, 300.0, 550.0, 400.0],
        ])
        ious = self.data_encoder.calculate_iou(first, second)
        # Second row contains its own box (IoU 1) and a small overlap;
        # first row overlaps box one by 0.2 and misses box two entirely.
        expected = torch.tensor([
            [0.2000, 0.0000],
            [1.0000, 0.07692307692],
        ])
        self.assertTrue(torch.all(torch.eq(ious, expected)).numpy())

    def test_shape_of_encoder_output(self):
        sample = torch.Tensor([
            [30.6000, 64.0000, 73.2000, 212.8000, 14.0000],
            [220.2000, 73.6000, 276.6000, 252.0000, 14.0000],
        ])
        encoded = self.data_encoder.encode(sample)
        self.assertListEqual(list(encoded.shape), [self.total_num_boxes, 5])

    def test_encoder_output(self):
        sample = torch.Tensor([
            [30.6000, 64.0000, 73.2000, 212.8000, 14.0000],
            [220.2000, 73.6000, 276.6000, 252.0000, 14.0000],
        ])
        ious = self.data_encoder.calculate_iou(
            sample[:, 0:4], self.data_encoder.default_boxes)
        print(ious)