class Splitter(object): def __init__(self, data_path, split_ratio): self._images_path = os.path.join(data_path, 'images') self._out_path = os.path.join(data_path, 'images-split') self._split_ratio = split_ratio self._parser = Parser(data_path) def create_directories(self, labels): shutil.rmtree(self._out_path, True) os.makedirs(self._out_path) for d in ['test', 'train']: for label in labels: os.makedirs(os.path.join(self._out_path, d, label)) def get_train_test_image_list(self, image_list): np.random.shuffle(image_list) train_images, test_images = np.split(np.array(image_list), [int(len(image_list) * self._split_ratio), ]) return train_images, test_images def copy_images(self, images, base_dir, label): for image in images: src = os.path.join(self._images_path, image) dst = os.path.join(self._out_path, base_dir, label, image) if os.path.isfile(src): print(f'Copying {src} to {dst}') shutil.copy(src, dst) def split(self): self._parser.parse() self.create_directories(self._parser.labels) for label in self._parser.labels: train_images, test_images = self.get_train_test_image_list(self._parser.get_label_images(label)) self.copy_images(train_images, 'train', label) self.copy_images(test_images, 'test', label) def split_binary(self): self._parser.parse() self.create_directories(self._parser.binary_labels) train_images, test_images = self.get_train_test_image_list(self._parser.get_no_anomaly_images()) self.copy_images(train_images, 'train', 'No-Anomaly') self.copy_images(test_images, 'test', 'No-Anomaly') train_images, test_images = self.get_train_test_image_list(self._parser.get_anomaly_images()) self.copy_images(train_images, 'train', 'Anomaly') self.copy_images(test_images, 'test', 'Anomaly')
class DataAnalyzer(object): def __init__(self, data_path): self._images_path = os.path.join(data_path, 'images') self._parser = Parser(data_path) self._results = MdUtils(file_name='results', title='Overview') self._num_images = None self._counts = None self._image_shape = None self._image_shape_mean = None def _compute_stats(self): self._num_images = self._parser.data.shape[1] self._counts = self._parser.data.loc['anomaly_class'].value_counts( ).to_dict() def _plot_random_image(self): random_image_file = f'{random.randint(0, self._num_images)}.jpg' image = img.imread(os.path.join(self._images_path, random_image_file)) self._image_shape = image.shape fig = plt.figure() plt.tight_layout() plt.imshow(image) plt.xticks([]), plt.yticks([]) plt.tight_layout() plt.title('Random Image') plt.savefig('random_image.png') plt.close(fig) plt.figure() plt.tight_layout() plt.hist(image.flatten()) plt.xlabel('Pixel Value') plt.ylabel('Counts') plt.title('Histogram') plt.savefig('random_image_histogram.png') plt.close(fig) def _compute_mean_shape(self): h, w = [], [] for im in self._parser.image_list: print(f'Reading image: {im}') image = img.imread(os.path.join(self._images_path, im)) h.append(image.shape[0]) w.append(image.shape[1]) self._image_shape_mean = (np.mean(h), np.mean(w)) def _plot_image_each_class(self): fig = plt.figure(figsize=(10, 10)) plt.tight_layout() plt.title('Random Image In Each Class') i = 1 for label in self._parser.labels: image_list = self._parser.get_label_images(label) random_selection = random.choice(image_list) image_path = self._parser.data.loc['image_filepath'].tolist( )[random_selection] image = img.imread(os.path.join(self._images_path, image_path[7:])) plt.subplot(4, 3, i) i += 1 plt.subplots_adjust(hspace=1, wspace=1) plt.title(f'Class: {label}') cur_axes = plt.gca() cur_axes.axes.get_xaxis().set_ticks([]) cur_axes.axes.get_yaxis().set_ticks([]) plt.imshow(np.uint8(image)) plt.savefig('random_image_each_class.png') plt.close(fig) def analyze(self): self._parser.parse() self._compute_stats() self._plot_random_image() self._compute_mean_shape() self._plot_image_each_class() def save_results(self): self._results.new_paragraph(f'Number of images: {self._num_images}') self._results.new_paragraph( f'Number of unique classes: {len(self._parser.labels)}') self._results.new_paragraph(f'Class names:') self._results.new_list(items=self._parser.labels) self._results.new_paragraph(f'Number of images per class: ') self._results.new_list( items=[f'{k}: {v}' for k, v in self._counts.items()]) self._results.new_paragraph(f'Image shape: {self._image_shape}') self._results.new_paragraph( f'Mean Image shape: {self._image_shape_mean}') self._results.new_paragraph( self._results.new_inline_image(text='Random Image', path='random_image.png')) self._results.new_paragraph( self._results.new_inline_image(text='Histogram', path='random_image_histogram.png')) self._results.new_paragraph( self._results.new_inline_image(text='Classes', path='random_image_each_class.png')) self._results.create_md_file()
#!/usr/bin/python import os, sys from data.parser import Parser sys.path.append(os.path.dirname(__file__)) data_filename='news_tagged_data.txt' if __name__ == "__main__": parser = Parser(data_filename) X,Y = parser.parse() print X print Y