# --- CLI argument definitions ---
parser.add_argument(
    '-c', '--count', type=int, default=100,
    help='Amount of query result images to fetch; images without '
         'metadata will be skipped so fewer images may be downloaded')
parser.add_argument(
    '-u', '--uris',
    help='Download images and metadata from existing list of Wikimedia '
         'Commons uris rather than querying them first')
parser.add_argument(
    '-d', '--directory', default='data/fetch/<timestamp>-commons',
    help='Directory to download images into; gets created if not exists')
args = parser.parse_args()

# Expand the <timestamp> placeholder so each run downloads into its own
# folder. strftime() already returns a str, so no extra conversion needed.
timestamp = datetime.now().strftime('%y-%m-%d-%H-%M')
directory = args.directory.replace('<timestamp>', timestamp)

# Resolve the image uris from exactly one of the three possible sources.
uris = []
if args.uris:
    name = os.path.basename(args.uris)
    uris = read_lines(args.uris)
elif args.metadata_query:
    name = args.metadata_query[0]
    uris = fetch_uris_from_metadata(args.metadata_query, args.count)
elif args.article_query:
    name = args.article_query[0]
    uris = fetch_uris_from_articles(args.article_query, args.count)
else:
    # parser.error() prints usage and exits with status 2. Unlike the
    # previous `assert False`, it is not stripped when running with -O
    # and gives the user a proper CLI error instead of a traceback.
    parser.error('One of parameters --uris, --metadata-query and '
                 '--article-query is required')

ensure_directory(directory)
print('Download', len(uris), 'images and metadata into', directory)
images_and_metadata(uris, directory)
# --- CLI argument definitions ---
parser.add_argument(
    '-c', '--count', type=int, default=100,
    help='Amount of query result images to fetch; images without '
         'metadata will be skipped so fewer images may be downloaded')
parser.add_argument(
    '-u', '--uris',
    help='Download images and metadata from existing list of Wikimedia '
         'Commons uris rather than querying them first')
parser.add_argument(
    '-d', '--directory', default='data/fetch/<timestamp>-commons',
    help='Directory to download images into; gets created if not exists')
args = parser.parse_args()

# Expand the <timestamp> placeholder so each run downloads into its own
# folder. strftime() already returns a str, so no extra conversion needed.
timestamp = datetime.now().strftime('%y-%m-%d-%H-%M')
directory = args.directory.replace('<timestamp>', timestamp)

# Resolve the image uris from exactly one of the three possible sources.
uris = []
if args.uris:
    name = os.path.basename(args.uris)
    uris = read_lines(args.uris)
elif args.metadata_query:
    name = args.metadata_query[0]
    uris = fetch_uris_from_metadata(args.metadata_query, args.count)
elif args.article_query:
    name = args.article_query[0]
    uris = fetch_uris_from_articles(args.article_query, args.count)
else:
    # parser.error() prints usage and exits with status 2. Unlike the
    # previous `assert False`, it is not stripped when running with -O
    # and gives the user a proper CLI error instead of a traceback.
    parser.error('One of parameters --uris, --metadata-query and '
                 '--article-query is required')

ensure_directory(directory)
print('Download', len(uris), 'images and metadata into', directory)
images_and_metadata(uris, directory)
def classify_images(self, keywords=None, limit=25):
    """
    Worker task to retrieve a list of automatically categorized images
    from Wikimedia Commons from a given list of keywords.

    keywords -- non-empty list of search terms; required despite the
                default (None/empty raises, same as the original [])
    limit -- maximum number of images to fetch, capped at QUERY_LIMIT

    Returns a dict {'current': 100, 'total': 100, 'result': [...]} where
    each result entry has 'thumbnail', 'image', 'label' and 'title' keys.
    Raises AssertionError when no keywords are given.
    """
    # None replaces the mutable default argument []; both are falsy, so
    # the validation below behaves exactly as before. The explicit raise
    # (rather than an `assert`) survives `python -O`; the exception type
    # is kept as AssertionError so existing callers' handlers still match.
    if not keywords:
        raise AssertionError('at least one keyword is required')
    with app.app_context():

        def supported_extractors():
            # Extractor set must match the one used to train the model.
            return [
                SizeFeature(),
                ColorFeature(),
                HistogramFeature(),
                GradientFeature(),
                FaceFeature(app.config['FACE_CLASSIFIER']),
                GeoFeature(),
                FormatFeature(),
                WordsFeature.create_from(app.config['WORDS_CONFIG']),
            ]

        def create_response_entry(label, sample):
            return {
                'thumbnail': sample.thumbnail,
                'image': sample.url,
                'label': label,
                'title': sample.url
            }

        def create_response(entries):
            return {'current': 100, 'total': 100, 'result': entries}

        # keep track of progress
        progress_observer = ProgressObserver(self)
        progress_observer.update(5)
        # query dbpedia for related images based on given keywords
        if limit > app.config['QUERY_LIMIT']:
            limit = app.config['QUERY_LIMIT']
        searchterm = ' '.join(keywords)
        uris = fetch_uris_from_metadata(searchterm, limit, multiple=False)
        progress_observer.update(20)
        # download images and metadata into temp folder with unique task id
        temp_folder = os.path.join(app.config['DOWNLOAD_DIRECTORY'],
                                   classify_images.request.id)
        try:
            images_and_metadata(uris, temp_folder, False,
                                observer=progress_observer)
            progress_observer.update(80)
            # load dataset and extract features
            dataset = Dataset(logging=True)
            dataset.read(root=temp_folder,
                         extractors=supported_extractors(),
                         unlabeled_data=True)
            # open via context manager so the config handle is closed
            # deterministically (json.load(open(...)) leaked it)
            with open(app.config['DATASET_CONFIG']) as config_file:
                dataset_config = json.load(config_file)
            dataset.means = dataset_config['means']
            dataset.stds = dataset_config['stds']
            dataset.normalize()
            progress_observer.update(90)
            # predict labels using the trained classifier
            classifier = joblib.load(app.config['WIKIMEDIA_CLASSIFIER'])
            predictions = classifier.predict(dataset.data)
            progress_observer.update(95)
            # build response; ndarray.item() replaces np.asscalar, which
            # was deprecated in NumPy 1.16 and removed in 1.23
            suggestions = []
            for index, sample in enumerate(dataset.samples):
                label = predictions[index].item()
                entry = create_response_entry(label, sample)
                suggestions.append(entry)
            result = create_response(suggestions)
        finally:
            # cleanup temporary directory even when a step above fails,
            # so failed tasks do not leak disk space
            delete_directory(temp_folder)
        progress_observer.update(100)
        return result
def classify_images(self, keywords=None, limit=25):
    """
    Worker task to retrieve a list of automatically categorized images
    from Wikimedia Commons from a given list of keywords.

    keywords -- non-empty list of search terms; required despite the
                default (None/empty raises, same as the original [])
    limit -- maximum number of images to fetch, capped at QUERY_LIMIT

    Returns a dict {"current": 100, "total": 100, "result": [...]} where
    each result entry has "thumbnail", "image", "label" and "title" keys.
    Raises AssertionError when no keywords are given.
    """
    # None replaces the mutable default argument []; both are falsy, so
    # the validation below behaves exactly as before. The explicit raise
    # (rather than an `assert`) survives `python -O`; the exception type
    # is kept as AssertionError so existing callers' handlers still match.
    if not keywords:
        raise AssertionError("at least one keyword is required")
    with app.app_context():

        def supported_extractors():
            # Extractor set must match the one used to train the model.
            return [
                SizeFeature(),
                ColorFeature(),
                HistogramFeature(),
                GradientFeature(),
                FaceFeature(app.config["FACE_CLASSIFIER"]),
                GeoFeature(),
                FormatFeature(),
                WordsFeature.create_from(app.config["WORDS_CONFIG"]),
            ]

        def create_response_entry(label, sample):
            return {
                "thumbnail": sample.thumbnail,
                "image": sample.url,
                "label": label,
                "title": sample.url,
            }

        def create_response(entries):
            return {"current": 100, "total": 100, "result": entries}

        # keep track of progress
        progress_observer = ProgressObserver(self)
        progress_observer.update(5)
        # query dbpedia for related images based on given keywords
        if limit > app.config["QUERY_LIMIT"]:
            limit = app.config["QUERY_LIMIT"]
        searchterm = " ".join(keywords)
        uris = fetch_uris_from_metadata(searchterm, limit, multiple=False)
        progress_observer.update(20)
        # download images and metadata into temp folder with unique task id
        temp_folder = os.path.join(app.config["DOWNLOAD_DIRECTORY"],
                                   classify_images.request.id)
        try:
            images_and_metadata(uris, temp_folder, False,
                                observer=progress_observer)
            progress_observer.update(80)
            # load dataset and extract features
            dataset = Dataset(logging=True)
            dataset.read(root=temp_folder,
                         extractors=supported_extractors(),
                         unlabeled_data=True)
            # open via context manager so the config handle is closed
            # deterministically (json.load(open(...)) leaked it)
            with open(app.config["DATASET_CONFIG"]) as config_file:
                dataset_config = json.load(config_file)
            dataset.means = dataset_config["means"]
            dataset.stds = dataset_config["stds"]
            dataset.normalize()
            progress_observer.update(90)
            # predict labels using the trained classifier
            classifier = joblib.load(app.config["WIKIMEDIA_CLASSIFIER"])
            predictions = classifier.predict(dataset.data)
            progress_observer.update(95)
            # build response; ndarray.item() replaces np.asscalar, which
            # was deprecated in NumPy 1.16 and removed in 1.23
            suggestions = []
            for index, sample in enumerate(dataset.samples):
                label = predictions[index].item()
                entry = create_response_entry(label, sample)
                suggestions.append(entry)
            result = create_response(suggestions)
        finally:
            # cleanup temporary directory even when a step above fails,
            # so failed tasks do not leak disk space
            delete_directory(temp_folder)
        progress_observer.update(100)
        return result