def export(self, filename, filter_, config=None):
    """Write the training data to `filename`.

    Images to be processed are obtained from the database. Which images
    are obtained and with which classes is set by the filter `filter_`.
    Image fingerprints are obtained from cache, which must have been
    created for configuration `config` or `self.config`.
    """
    session, metadata = db.get_session_or_error()

    if not conf.force_overwrite and os.path.isfile(filename):
        raise FileExistsError(filename)

    # Get the classification categories from the database.
    classes = db.get_classes_from_filter(session, metadata, filter_)
    assert len(classes) > 0, \
        "No classes found for filter `%s`" % filter_

    # Get the photos and corresponding classification using the filter.
    images = db.get_filtered_photos_with_taxon(session, metadata, filter_)
    images = images.all()
    if not images:
        logging.info("No images found for the filter `%s`", filter_)
        return

    if self.get_photo_count_min():
        assert len(images) >= self.get_photo_count_min(), \
            "Expected to find at least photo_count_min={0} photos, found " \
            "{1}".format(self.get_photo_count_min(), len(images))

    # Calculate the number of images that will be processed, taking into
    # account the subset.
    photo_ids = np.array([photo.id for photo, _ in images])
    if self.subset:
        n_images = len(np.intersect1d(list(photo_ids), list(self.subset)))
    else:
        n_images = len(images)

    logging.info("Going to process %d photos...", n_images)

    # Make a codeword for each class.
    codewords = get_codewords(classes)

    # Construct the header.
    header_data, header_out = self.__make_header(len(classes))
    header = ["ID"] + header_data + header_out

    # Get the configurations.
    if not config:
        config = self.config

    # Load the fingerprint cache.
    self.cache.load_cache(self.cache_path, config)

    # Generate the training data.
    with open(filename, 'w') as fh:
        # Write the header.
        fh.write("%s\n" % "\t".join(header))

        # Set the training data.
        training_data = TrainData(len(header_data), len(classes))

        for photo, class_ in images:
            # Only export the subset if an export subset is set.
            if self.subset and photo.id not in self.subset:
                continue

            logging.info("Processing `%s` of class `%s`...",
                         photo.path, class_)

            # Get phenotype for this image from the cache.
            phenotype = self.cache.get_phenotype(photo.md5sum)

            assert len(phenotype) == len(header_data), \
                "Fingerprint size mismatch. According to the header " \
                "there are {0} data columns, but the fingerprint has " \
                "{1}".format(len(header_data), len(phenotype))

            training_data.append(phenotype, codewords[class_],
                                 label=photo.id)

        training_data.finalize()

        if not training_data:
            raise ValueError("Training data cannot be empty")

        # Round feature data.
        training_data.round_input(6)

        # Write data rows.
        for photo_id, input_, output in training_data:
            row = [str(photo_id)]
            row.extend(input_.astype(str))
            row.extend(output.astype(str))
            fh.write("%s\n" % "\t".join(row))

    logging.info("Training data written to %s", filename)
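# `export` relies on the helper `get_codewords` to map each class to a
# fixed output vector for the training rows. A minimal sketch of what such
# a helper could look like, assuming one-hot style codewords with
# configurable on/off values; the name `get_codewords_sketch`, the
# signature, and the default values are assumptions for illustration, not
# necessarily this project's implementation:

def get_codewords_sketch(classes, on=1, off=-1):
    """Return a dict mapping each class to its codeword: a list that is
    `off` everywhere except at the class's own position, which is `on`."""
    codewords = {}
    for i, class_ in enumerate(sorted(classes)):
        codeword = [off] * len(classes)
        codeword[i] = on
        codewords[class_] = codeword
    return codewords

# Example: get_codewords_sketch(['a', 'b']) -> {'a': [1, -1], 'b': [-1, 1]}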
def test_get_filtered_photos_with_taxon(self):
    """Test the get_filtered_photos_with_taxon() method."""
    filter_mexi_species = {
        'class': 'species',
        'where': {
            'genus': 'Mexipedium',
            'section': None
        }
    }
    filter_mexi_section = {
        'class': 'section',
        'where': {
            'genus': 'Mexipedium'
        }
    }
    filter_trigo = {
        'class': 'species',
        'where': {
            'genus': 'Cypripedium',
            'section': 'Trigonopedia'
        }
    }
    filter_cypr = {'class': 'section', 'where': {'genus': 'Cypripedium'}}
    filter_cypr_none = {
        'class': 'species',
        'where': {
            'genus': 'Cypripedium',
            'section': None
        }
    }
    filter_genera = {'class': 'genus'}

    with db.session_scope(META_FILE) as (session, metadata):
        q = db.get_filtered_photos_with_taxon(session, metadata,
                                              filter_mexi_species).all()
        self.assertEqual(len(q), 3)
        for photo, class_ in q:
            self.assertEqual(class_, self.expected_taxa[photo.md5sum][2])

        q = db.get_filtered_photos_with_taxon(session, metadata,
                                              filter_mexi_section).all()
        self.assertEqual(len(q), 3)
        for photo, class_ in q:
            self.assertEqual(class_, self.expected_taxa[photo.md5sum][1])

        q = db.get_filtered_photos_with_taxon(session, metadata,
                                              filter_trigo).all()
        self.assertEqual(len(q), 6)
        taxa = set([taxon for photo, taxon in q])
        self.assertEqual(taxa, set(['fargesii', 'sichuanense']))

        q = db.get_filtered_photos_with_taxon(session, metadata,
                                              filter_cypr).all()
        self.assertEqual(len(q), 13)
        taxa = set([taxon for photo, taxon in q])
        self.assertEqual(
            taxa, set(['Arietinum', 'Obtusipetala', 'Trigonopedia']))

        q = db.get_filtered_photos_with_taxon(session, metadata,
                                              filter_cypr_none).all()
        self.assertEqual(len(q), 0)

        q = db.get_filtered_photos_with_taxon(session, metadata,
                                              filter_genera).all()
        self.assertEqual(len(q), len(self.expected_taxa))
        taxa = set([taxon for photo, taxon in q])
        self.assertEqual(
            taxa,
            set([
                'Cypripedium', 'Mexipedium', 'Paphiopedilum',
                'Selenipedium', 'Phragmipedium'
            ]))
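# The filter dictionaries in the test above follow a convention: 'class'
# names the taxonomic rank whose value becomes the classification label,
# and 'where' constrains ancestor ranks ('section': None matches photos
# whose taxon has no section). The real method builds a database query;
# the following is only a hypothetical sketch over plain Python records to
# illustrate the filter semantics, with all names assumed for this
# example:

def apply_filter_sketch(records, filter_):
    """Yield (photo, class) pairs from `records`, assumed to be dicts
    with keys like 'photo', 'genus', 'section', and 'species'."""
    where = filter_.get('where', {})
    rank = filter_['class']
    for rec in records:
        if all(rec.get(k) == v for k, v in where.items()):
            yield rec['photo'], rec[rank]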
def export(self, filename, filter_, config=None, codebook_file=None):
    """Write the training data to `filename`.

    Images to be processed are obtained from the database. Which images
    are obtained and with which classes is set by the filter `filter_`.
    Image fingerprints are obtained from cache, which must have been
    created for configuration `config` or `self.config`.
    """
    session, metadata = db.get_session_or_error()

    if not conf.force_overwrite and os.path.isfile(filename):
        raise FileExistsError(filename)

    # Get the classification categories from the database.
    classes = db.get_classes_from_filter(session, metadata, filter_)
    assert len(classes) > 0, \
        "No classes found for filter `%s`" % filter_

    # Get the photos and corresponding classification using the filter.
    images = db.get_filtered_photos_with_taxon(session, metadata, filter_)
    images = images.all()
    if not images:
        logging.info("No images found for the filter `%s`", filter_)
        return

    if self.get_photo_count_min():
        assert len(images) >= self.get_photo_count_min(), \
            "Expected to find at least photo_count_min={0} photos, found " \
            "{1}".format(self.get_photo_count_min(), len(images))

    # Calculate the number of images that will be processed, taking into
    # account the subset.
    photo_ids = np.array([photo.id for photo, _ in images])
    if self.subset:
        n_images = len(np.intersect1d(list(photo_ids), list(self.subset)))
    else:
        n_images = len(images)

    logging.info("Going to process %d photos...", n_images)

    # Make a codeword for each class.
    codewords = get_codewords(classes)

    # Construct the header.
    header_data, header_out = self.__make_header(len(classes))
    header = ["ID"] + header_data + header_out

    # Get the configurations.
    if not config:
        config = self.config

    # Load the fingerprint cache.
    self.cache.load_cache(self.cache_path, config)

    # Check if the BagOfWords algorithm needs to be applied.
    use_bow = getattr(self.config.features['surf'], 'bow_clusters', False)
    if use_bow and codebook_file is None:
        codebook = self.__make_codebook(images, filename)
    elif use_bow:
        with open(codebook_file, "rb") as cb:
            codebook = load(cb)

    # Generate the training data.
    with open(filename, 'w') as fh:
        # Write the header.
        fh.write("%s\n" % "\t".join(header))

        # Set the training data.
        training_data = TrainData(len(header_data), len(classes))

        for photo, class_ in images:
            # Only export the subset if an export subset is set.
            if self.subset and photo.id not in self.subset:
                continue

            logging.info("Processing `%s` of class `%s`...",
                         photo.path, class_)

            # Get phenotype for this image from the cache.
            phenotype = self.cache.get_phenotype(photo.md5sum)

            # If the BagOfWords algorithm is applied, convert the
            # phenotype to a BOW code.
            if use_bow:
                phenotype = get_bowcode_from_surf_features(
                    phenotype, codebook)

            assert len(phenotype) == len(header_data), \
                "Fingerprint size mismatch. According to the header " \
                "there are {0} data columns, but the fingerprint has " \
                "{1}".format(len(header_data), len(phenotype))

            training_data.append(phenotype, codewords[class_],
                                 label=photo.id)

        training_data.finalize()

        if not training_data:
            raise ValueError("Training data cannot be empty")

        # Round feature data only if BOW is not applied.
        if not use_bow:
            training_data.round_input(6)

        # Write data rows.
        for photo_id, input_, output in training_data:
            row = [str(photo_id)]
            row.extend(input_.astype(str))
            row.extend(output.astype(str))
            fh.write("%s\n" % "\t".join(row))

    logging.info("Training data written to %s", filename)
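# The BagOfWords branch above depends on two helpers not shown here:
# __make_codebook (clusters SURF descriptors into a codebook) and
# get_bowcode_from_surf_features (quantizes one image's descriptors
# against that codebook). A minimal sketch of the quantization step,
# assuming the codebook is a (k, d) array of k-means centroids over
# d-dimensional SURF descriptors; the function name and normalization are
# assumptions for illustration, not this project's exact implementation:

import numpy as np
from scipy.cluster.vq import vq

def bow_histogram_sketch(descriptors, codebook):
    """Assign each descriptor to its nearest codebook centroid and return
    the normalized histogram of centroid counts (the BOW code)."""
    descriptors = np.asarray(descriptors, dtype=np.float64)
    assignments, _ = vq(descriptors, codebook)
    hist, _ = np.histogram(assignments, bins=np.arange(len(codebook) + 1))
    return hist.astype(np.float64) / max(len(descriptors), 1)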