if __name__ == '__main__':
    # Command-line entry point: extract visual and/or textual features
    # from a dataset directory and store them as JSON.
    parser = ArgumentParser(
        description='Extract features of a dataset.',
        formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        'dataset',
        help='Path to the directory containing folders for each class that '
             'contain the images and metadata')
    parser.add_argument(
        '-i', '--image', action='store_true',
        help='Whether to extract visual features from the image')
    parser.add_argument(
        '-t', '--textual', action='store_true',
        help='Whether to extract textual features; requires JSON metadata '
             'files for all the images')
    parser.add_argument(
        '-v', '--vocabulary', default='<dataset>/vocabulary.json',
        help='Filename of the JSON vocabulary used for bag of words')
    parser.add_argument(
        '-o', '--output', default='<dataset>/features.json',
        help='Where to store the extracted features')
    args = parser.parse_args()
    # Expand the <dataset> placeholder so the default paths live next to
    # the dataset directory itself.
    args.output = args.output.replace('<dataset>', args.dataset)
    args.vocabulary = args.vocabulary.replace('<dataset>', args.dataset)
    # Validate with parser.error() instead of assert: assert statements
    # are stripped under python -O, and parser.error() prints usage and
    # exits with a non-zero status.
    if not (args.image or args.textual):
        parser.error('Need at least one feature source')
    if args.textual:
        if not args.vocabulary:
            parser.error('Vocabulary is needed for textual features')
        WordsFeature.load_vocabulary(args.vocabulary)
    dataset = Dataset()
    dataset.read(args.dataset, args.image, args.textual)
    dataset.save(args.output)
def classify_images(self, keywords=None, limit=25):
    """
    Worker task to retrieve a list of automatically categorized images
    from Wikimedia Commons from a given list of keywords.

    :param keywords: non-empty list of search terms; joined with spaces
        into a single query string.
    :param limit: maximum number of images to fetch, capped at the
        application's QUERY_LIMIT setting.
    :returns: dict with 'current', 'total' and 'result' keys, where
        'result' is a list of suggestion entries.
    :raises AssertionError: if no keywords are given.
    """
    # None default instead of the mutable [] default; AssertionError is
    # kept for backward compatibility with existing callers.
    if not keywords:
        raise AssertionError('At least one keyword is required')
    with app.app_context():
        def supported_extractors():
            # Feature extractors; must match those used when training
            # the classifier loaded below.
            return [
                SizeFeature(),
                ColorFeature(),
                HistogramFeature(),
                GradientFeature(),
                FaceFeature(app.config['FACE_CLASSIFIER']),
                GeoFeature(),
                FormatFeature(),
                WordsFeature.create_from(app.config['WORDS_CONFIG']),
            ]

        def create_response_entry(label, sample):
            # One suggestion entry for the API response.
            return {
                'thumbnail': sample.thumbnail,
                'image': sample.url,
                'label': label,
                'title': sample.url
            }

        def create_response(entries):
            return {'current': 100, 'total': 100, 'result': entries}

        # keep track of progress
        progress_observer = ProgressObserver(self)
        progress_observer.update(5)
        # query DBpedia for related images based on given keywords
        if limit > app.config['QUERY_LIMIT']:
            limit = app.config['QUERY_LIMIT']
        searchterm = ' '.join(keywords)
        uris = fetch_uris_from_metadata(searchterm, limit, multiple=False)
        progress_observer.update(20)
        # download images and metadata into temp folder with unique task id
        temp_folder = os.path.join(app.config['DOWNLOAD_DIRECTORY'],
                                   classify_images.request.id)
        try:
            images_and_metadata(uris, temp_folder, False,
                                observer=progress_observer)
            progress_observer.update(80)
            # load dataset and extract features
            dataset = Dataset(logging=True)
            dataset.read(root=temp_folder,
                         extractors=supported_extractors(),
                         unlabeled_data=True)
            # Close the config file deterministically instead of leaking
            # the handle from json.load(open(...)).
            with open(app.config['DATASET_CONFIG']) as config_file:
                dataset_config = json.load(config_file)
            dataset.means = dataset_config['means']
            dataset.stds = dataset_config['stds']
            dataset.normalize()
            progress_observer.update(90)
            # predict labels using the trained classifier
            classifier = joblib.load(app.config['WIKIMEDIA_CLASSIFIER'])
            predictions = classifier.predict(dataset.data)
            progress_observer.update(95)
            # build response
            suggestions = []
            for index, sample in enumerate(dataset.samples):
                # .item() replaces np.asscalar, which was deprecated in
                # NumPy 1.16 and removed in 1.23.
                label = predictions[index].item()
                entry = create_response_entry(label, sample)
                suggestions.append(entry)
            result = create_response(suggestions)
        finally:
            # cleanup the temporary directory even if a step above failed
            delete_directory(temp_folder)
        progress_observer.update(100)
        return result
def classify_images(self, keywords=None, limit=25):
    """
    Worker task to retrieve a list of automatically categorized images
    from Wikimedia Commons from a given list of keywords.

    :param keywords: non-empty list of search terms; joined with spaces
        into a single query string.
    :param limit: maximum number of images to fetch, capped at the
        application's QUERY_LIMIT setting.
    :returns: dict with "current", "total" and "result" keys, where
        "result" is a list of suggestion entries.
    :raises AssertionError: if no keywords are given.
    """
    # None default instead of the mutable [] default; AssertionError is
    # kept for backward compatibility with existing callers.
    if not keywords:
        raise AssertionError("At least one keyword is required")
    with app.app_context():
        def supported_extractors():
            # Feature extractors; must match those used when training
            # the classifier loaded below.
            return [
                SizeFeature(),
                ColorFeature(),
                HistogramFeature(),
                GradientFeature(),
                FaceFeature(app.config["FACE_CLASSIFIER"]),
                GeoFeature(),
                FormatFeature(),
                WordsFeature.create_from(app.config["WORDS_CONFIG"]),
            ]

        def create_response_entry(label, sample):
            # One suggestion entry for the API response.
            return {
                "thumbnail": sample.thumbnail,
                "image": sample.url,
                "label": label,
                "title": sample.url,
            }

        def create_response(entries):
            return {"current": 100, "total": 100, "result": entries}

        # keep track of progress
        progress_observer = ProgressObserver(self)
        progress_observer.update(5)
        # query DBpedia for related images based on given keywords
        if limit > app.config["QUERY_LIMIT"]:
            limit = app.config["QUERY_LIMIT"]
        searchterm = " ".join(keywords)
        uris = fetch_uris_from_metadata(searchterm, limit, multiple=False)
        progress_observer.update(20)
        # download images and metadata into temp folder with unique task id
        temp_folder = os.path.join(app.config["DOWNLOAD_DIRECTORY"],
                                   classify_images.request.id)
        try:
            images_and_metadata(uris, temp_folder, False,
                                observer=progress_observer)
            progress_observer.update(80)
            # load dataset and extract features
            dataset = Dataset(logging=True)
            dataset.read(root=temp_folder,
                         extractors=supported_extractors(),
                         unlabeled_data=True)
            # Close the config file deterministically instead of leaking
            # the handle from json.load(open(...)).
            with open(app.config["DATASET_CONFIG"]) as config_file:
                dataset_config = json.load(config_file)
            dataset.means = dataset_config["means"]
            dataset.stds = dataset_config["stds"]
            dataset.normalize()
            progress_observer.update(90)
            # predict labels using the trained classifier
            classifier = joblib.load(app.config["WIKIMEDIA_CLASSIFIER"])
            predictions = classifier.predict(dataset.data)
            progress_observer.update(95)
            # build response
            suggestions = []
            for index, sample in enumerate(dataset.samples):
                # .item() replaces np.asscalar, which was deprecated in
                # NumPy 1.16 and removed in 1.23.
                label = predictions[index].item()
                entry = create_response_entry(label, sample)
                suggestions.append(entry)
            result = create_response(suggestions)
        finally:
            # cleanup the temporary directory even if a step above failed
            delete_directory(temp_folder)
        progress_observer.update(100)
        return result
# Assemble the feature extractors requested on the command line, then
# run feature extraction over the dataset and save the result.
args = parser.parse_args()
args.output = args.output.replace('<dataset>', args.dataset)
# parser.error() instead of assert: asserts are stripped under -O and
# parser.error() prints usage and exits with a non-zero status.
if not (args.visual or args.textual):
    parser.error('Need at least one feature source')
extractors = []
if args.visual:
    extractors.append(SizeFeature())
    extractors.append(ColorFeature())
    extractors.append(HistogramFeature())
    extractors.append(GradientFeature())
    # The face feature is optional: skip it gracefully when the trained
    # detector file is missing rather than crashing.
    if os.path.isfile(args.trained_faces):
        extractors.append(FaceFeature(args.trained_faces))
    else:
        print('Skip face feature since the trained face detector was not '
              'found at {}.'.format(args.trained_faces))
if args.textual:
    samples = read_samples(args.dataset)
    if os.path.isfile(args.stopwords):
        # NOTE(review): this file handle is never explicitly closed and
        # WordsFeature's expected argument type (path vs. file object) is
        # not visible here — confirm and switch to a context manager if
        # WordsFeature consumes the contents eagerly.
        args.stopwords = open(args.stopwords)
    extractors.append(GeoFeature())
    extractors.append(FormatFeature())
    extractors.append(WordsFeature(samples, args.stopwords))
dataset = Dataset(logging=True)
dataset.read(args.dataset, extractors)
dataset.save(args.output)