Example #1
    def export(self, filename, filter_, config=None):
        """Write the training data to `filename`.

        The images to be processed are obtained from the database; the filter
        `filter_` determines which images are selected and which class each one
        belongs to. Image fingerprints are read from the cache, which must have
        been created for the configuration `config` or `self.config`.
        """
        session, metadata = db.get_session_or_error()

        if not conf.force_overwrite and os.path.isfile(filename):
            raise FileExistsError(filename)

        # Get the classification categories from the database.
        classes = db.get_classes_from_filter(session, metadata, filter_)
        assert len(classes) > 0, \
            "No classes found for filter `%s`" % filter_

        # Get the photos and corresponding classification using the filter.
        images = db.get_filtered_photos_with_taxon(session, metadata, filter_)
        images = images.all()

        if not images:
            logging.info("No images found for the filter `%s`", filter_)
            return

        if self.get_photo_count_min():
            assert len(images) >= self.get_photo_count_min(), \
                "Expected to find at least photo_count_min={0} photos, found " \
                "{1}".format(self.get_photo_count_min(), len(images))

        # Calculate the number of images that will be processed, taking into
        # account the subset.
        photo_ids = np.array([photo.id for photo, _ in images])

        if self.subset:
            n_images = len(np.intersect1d(list(photo_ids), list(self.subset)))
        else:
            n_images = len(images)

        logging.info("Going to process %d photos...", n_images)

        # Make a codeword for each class.
        codewords = get_codewords(classes)

        # Construct the header.
        header_data, header_out = self.__make_header(len(classes))
        header = ["ID"] + header_data + header_out

        # Get the configurations.
        if not config:
            config = self.config

        # Load the fingerprint cache.
        self.cache.load_cache(self.cache_path, config)

        # Generate the training data.
        with open(filename, 'w') as fh:
            # Write the header.
            fh.write("%s\n" % "\t".join(header))

            # Set the training data.
            training_data = TrainData(len(header_data), len(classes))

            for photo, class_ in images:
                # Skip photos that are not in the export subset, if one is set.
                if self.subset and photo.id not in self.subset:
                    continue

                logging.info("Processing `%s` of class `%s`...",
                    photo.path, class_)

                # Get phenotype for this image from the cache.
                phenotype = self.cache.get_phenotype(photo.md5sum)

                assert len(phenotype) == len(header_data), \
                    "Fingerprint size mismatch. According to the header " \
                    "there are {0} data columns, but the fingerprint has " \
                    "{1}".format(len(header_data), len(phenotype))

                training_data.append(phenotype, codewords[class_],
                    label=photo.id)

            training_data.finalize()

            if not training_data:
                raise ValueError("Training data cannot be empty")

            # Round feature data.
            training_data.round_input(6)

            # Write data rows.
            for photo_id, input_, output in training_data:
                row = [str(photo_id)]
                row.extend(input_.astype(str))
                row.extend(output.astype(str))
                fh.write("%s\n" % "\t".join(row))

        logging.info("Training data written to %s", filename)
Example #2
    def test_get_filtered_photos_with_taxon(self):
        """Test the get_filtered_photos_with_taxon() method."""
        filter_mexi_species = {
            'class': 'species',
            'where': {
                'genus': 'Mexipedium',
                'section': None
            }
        }

        filter_mexi_section = {
            'class': 'section',
            'where': {
                'genus': 'Mexipedium'
            }
        }

        filter_trigo = {
            'class': 'species',
            'where': {
                'genus': 'Cypripedium',
                'section': 'Trigonopedia'
            }
        }

        filter_cypr = {'class': 'section', 'where': {'genus': 'Cypripedium'}}

        filter_cypr_none = {
            'class': 'species',
            'where': {
                'genus': 'Cypripedium',
                'section': None
            }
        }

        filter_genera = {'class': 'genus'}

        with db.session_scope(META_FILE) as (session, metadata):
            q = db.get_filtered_photos_with_taxon(session, metadata,
                                                  filter_mexi_species).all()
            self.assertEqual(len(q), 3)
            for photo, class_ in q:
                self.assertEqual(class_, self.expected_taxa[photo.md5sum][2])

            q = db.get_filtered_photos_with_taxon(session, metadata,
                                                  filter_mexi_section).all()
            self.assertEqual(len(q), 3)
            for photo, class_ in q:
                self.assertEqual(class_, self.expected_taxa[photo.md5sum][1])

            q = db.get_filtered_photos_with_taxon(session, metadata,
                                                  filter_trigo).all()
            self.assertEqual(len(q), 6)
            taxa = set([taxon for photo, taxon in q])
            self.assertEqual(taxa, set(['fargesii', 'sichuanense']))

            q = db.get_filtered_photos_with_taxon(session, metadata,
                                                  filter_cypr).all()
            self.assertEqual(len(q), 13)
            taxa = set([taxon for photo, taxon in q])
            self.assertEqual(
                taxa, set(['Arietinum', 'Obtusipetala', 'Trigonopedia']))

            q = db.get_filtered_photos_with_taxon(session, metadata,
                                                  filter_cypr_none).all()
            self.assertEqual(len(q), 0)

            q = db.get_filtered_photos_with_taxon(session, metadata,
                                                  filter_genera).all()
            self.assertEqual(len(q), len(self.expected_taxa))
            taxa = set([taxon for photo, taxon in q])
            self.assertEqual(
                taxa,
                set([
                    'Cypripedium', 'Mexipedium', 'Paphiopedilum',
                    'Selenipedium', 'Phragmipedium'
                ]))
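The filter dictionaries in this test all follow one pattern: `class` names the taxonomic rank whose value becomes the classification label, and the optional `where` clause restricts photos by rank values, with `None` requiring that rank to be unset. Below is a rough sketch of that matching logic in plain Python; it is illustrative only, since the real `get_filtered_photos_with_taxon()` builds a database query rather than filtering in Python.

def matches_filter(photo_taxa, filter_):
    """Illustration only: test a photo's taxa dict against a filter dict."""
    for rank, value in filter_.get('where', {}).items():
        if photo_taxa.get(rank) != value:
            return False
    return True

# A photo of Cypripedium fargesii from section Trigonopedia...
photo_taxa = {'genus': 'Cypripedium', 'section': 'Trigonopedia',
              'species': 'fargesii'}
filter_trigo = {'class': 'species',
                'where': {'genus': 'Cypripedium', 'section': 'Trigonopedia'}}

# ...matches the Trigonopedia filter and is labelled with its species name.
if matches_filter(photo_taxa, filter_trigo):
    class_ = photo_taxa[filter_trigo['class']]  # 'fargesii'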
Example #3
    def export(self, filename, filter_, config=None, codebook_file=None):
        """Write the training data to `filename`.

        The images to be processed are obtained from the database; the filter
        `filter_` determines which images are selected and which class each one
        belongs to. Image fingerprints are read from the cache, which must have
        been created for the configuration `config` or `self.config`.
        """
        session, metadata = db.get_session_or_error()

        if not conf.force_overwrite and os.path.isfile(filename):
            raise FileExistsError(filename)

        # Get the classification categories from the database.
        classes = db.get_classes_from_filter(session, metadata, filter_)
        assert len(classes) > 0, \
            "No classes found for filter `%s`" % filter_

        # Get the photos and corresponding classification using the filter.
        images = db.get_filtered_photos_with_taxon(session, metadata, filter_)
        images = images.all()

        if not images:
            logging.info("No images found for the filter `%s`", filter_)
            return

        if self.get_photo_count_min():
            assert len(images) >= self.get_photo_count_min(), \
                "Expected to find at least photo_count_min={0} photos, found " \
                "{1}".format(self.get_photo_count_min(), len(images))

        # Calculate the number of images that will be processed, taking into
        # account the subset.
        photo_ids = np.array([photo.id for photo, _ in images])

        if self.subset:
            n_images = len(np.intersect1d(list(photo_ids), list(self.subset)))
        else:
            n_images = len(images)

        logging.info("Going to process %d photos...", n_images)

        # Make a codeword for each class.
        codewords = get_codewords(classes)

        # Construct the header.
        header_data, header_out = self.__make_header(len(classes))
        header = ["ID"] + header_data + header_out

        # Get the configurations.
        if not config:
            config = self.config

        # Load the fingerprint cache.
        self.cache.load_cache(self.cache_path, config)

        # Check whether the BagOfWords algorithm needs to be applied.
        use_bow = getattr(self.config.features['surf'], 'bow_clusters', False)
        if use_bow and codebook_file is None:
            codebook = self.__make_codebook(images, filename)
        elif use_bow:
            with open(codebook_file, "rb") as cb:
                codebook = load(cb)

        # Generate the training data.
        with open(filename, 'w') as fh:
            # Write the header.
            fh.write("%s\n" % "\t".join(header))

            # Set the training data.
            training_data = TrainData(len(header_data), len(classes))

            for photo, class_ in images:
                # Skip photos that are not in the export subset, if one is set.
                if self.subset and photo.id not in self.subset:
                    continue

                logging.info("Processing `%s` of class `%s`...", photo.path,
                             class_)

                # Get phenotype for this image from the cache.
                phenotype = self.cache.get_phenotype(photo.md5sum)

                # If the BagOfWords algorithm is applied,
                # convert phenotype to BOW-code.
                if use_bow:
                    phenotype = get_bowcode_from_surf_features(
                        phenotype, codebook)

                assert len(phenotype) == len(header_data), \
                    "Fingerprint size mismatch. According to the header " \
                    "there are {0} data columns, but the fingerprint has " \
                    "{1}".format(len(header_data), len(phenotype))

                training_data.append(phenotype,
                                     codewords[class_],
                                     label=photo.id)

            training_data.finalize()

            if not training_data:
                raise ValueError("Training data cannot be empty")

            # Round feature data only if BOW is not applied.
            if not use_bow:
                training_data.round_input(6)

            # Write data rows.
            for photo_id, input_, output in training_data:
                row = [str(photo_id)]
                row.extend(input_.astype(str))
                row.extend(output.astype(str))
                fh.write("%s\n" % "\t".join(row))

        logging.info("Training data written to %s", filename)