def reAndThum(filePath, outFolder, outSize=(200, 200)):
    """Rename an image, build a thumbnail, and index the thumbnail in the
    image-match database.

    Any failure at any stage appends ``filePath`` to the shared ``failedList``
    (guarded by the module-level ``lock``) and prints a short notice.

    :param filePath: path of the source image to process
    :param outFolder: destination folder for the generated thumbnail
    :param outSize: thumbnail dimensions, default (200, 200)
    """
    global failedList

    def _record_failure(message):
        # `with lock` guarantees the lock is released even if printing
        # raises — the original acquire()/release() pair could leak the
        # lock on an exception between the two calls.
        with lock:
            failedList.append(filePath)
            print(os.path.basename(outPath), message)
            print()

    # Try to rename the file
    flag1, outPath = reName(filePath)
    if flag1 == -1:
        _record_failure("failed.")
        return

    # Build the thumbnail from the renamed file
    flag2, thumbPath = mkThumb(outPath, outFolder, outSize)
    if flag2 == -1:
        _record_failure("failed.")
        return

    # Add the image to the image-match database (using the thumbnail)
    try:
        es = Elasticsearch()
        ses = SignatureES(es)
        ses.add_image(thumbPath)
    except Exception:
        # best-effort indexing: record the failure instead of crashing
        _record_failure("Failed to add to image-match database.")
def add_imgs():
    """Demo routine: compare two Mona Lisa reproductions by signature
    distance, then index one sample product image into image-match.

    Prints the normalized distance between the two reference images.
    """
    gis = ImageSignature()
    sig_original = gis.generate_signature(
        'https://upload.wikimedia.org/wikipedia/commons/thumb/e/ec/Mona_Lisa,_by_Leonardo_da_Vinci,_from_C2RMF_retouched.jpg/687px-Mona_Lisa,_by_Leonardo_da_Vinci,_from_C2RMF_retouched.jpg'
    )
    sig_copy = gis.generate_signature(
        'https://upload.wikimedia.org/wikipedia/commons/thumb/9/99/Gioconda_%28copia_del_Museo_del_Prado_restaurada%29.jpg/800px-Gioconda_%28copia_del_Museo_del_Prado_restaurada%29.jpg'
    )
    res = gis.normalized_distance(sig_original, sig_copy)
    print(res)

    es = Elasticsearch()
    ses = SignatureES(es)
    mypath = '/var/www/html/boots-market/image/catalog/product'
    ses.add_image(mypath + '/' + 'almcdnruimg389x562frfr030awdzpc579240581v1.jpg')
    # NOTE(review): the original ended with `return` followed by an
    # unreachable directory-indexing loop (listdir/isfile over mypath);
    # that dead code and two commented-out add_image calls were removed.
class WorkWithSignatures():
    """Thin wrapper around an image-match SignatureES store.

    NOTE(review): `es`/`ses` are class attributes created at import time —
    constructing this module opens an Elasticsearch connection to the host
    named 'elasticsearch' (port 9200); confirm that is intended.
    """

    # Default goldberg-signature parameters used to build `ses` below.
    n_grid = 9
    crop_percentile = (5, 95)
    P = None
    diagonal_neighbors = True
    identical_tolerance = 2 / 255
    n_levels = 2
    search_rotated = False

    es = Elasticsearch(
        ['elasticsearch'],
        port=9200,
    )
    ses = SignatureES(es,
                      n_grid=n_grid,
                      crop_percentile=crop_percentile,
                      diagonal_neighbors=diagonal_neighbors,
                      identical_tolerance=identical_tolerance,
                      n_levels=n_levels)

    def clear_db(self):
        """Drop the 'images' index and rebuild the client/store.

        NOTE(review): the rebuilt client uses Elasticsearch() with default
        hosts, not the class-level ['elasticsearch'] host — verify this
        difference is intentional.
        """
        self.es.indices.delete(index='images', ignore=[400, 404])
        self.es = Elasticsearch()
        self.ses = SignatureES(self.es,
                               n_grid=self.n_grid,
                               crop_percentile=self.crop_percentile,
                               diagonal_neighbors=self.diagonal_neighbors,
                               identical_tolerance=self.identical_tolerance,
                               n_levels=self.n_levels)

    def reload_params(self, params):
        """Replace the signature store using the given parameter dict.

        Expects keys: n_grid, crop_percentile, P, diagonal_neighbors,
        identical_tolerance, n_levels, search_rotated.
        NOTE(review): this mutates the caller's dict (pops
        'search_rotated') and forwards the rest — including 'P' — as
        keyword arguments to SignatureES.
        """
        self.n_grid = params['n_grid']
        self.crop_percentile = params['crop_percentile']
        self.P = params['P']
        self.diagonal_neighbors = params['diagonal_neighbors']
        self.identical_tolerance = params['identical_tolerance']
        self.n_levels = params['n_levels']
        self.search_rotated = params['search_rotated']
        # search_rotated is a search-time flag, not a SignatureES kwarg
        params.pop("search_rotated", None)
        self.ses = SignatureES(self.es, **params)

    def get_all_params(self):
        """Return the current parameter set, including search_rotated."""
        return {
            'n_grid': self.n_grid,
            'crop_percentile': self.crop_percentile,
            'P': self.P,
            'diagonal_neighbors': self.diagonal_neighbors,
            'identical_tolerance': self.identical_tolerance,
            'n_levels': self.n_levels,
            'search_rotated': self.search_rotated
        }

    def load_file(self, path):
        """Index a single image file (path or URL) into the store."""
        self.ses.add_image(path)

    def search_file(self, file_bytes):
        """Search the store by raw image bytes; rotations are tried only
        when search_rotated is True."""
        return self.ses.search_image(file_bytes,
                                     bytestream=True,
                                     all_orientations=self.search_rotated)
def add():
    """Index the reference Mona Lisa image into image-match and report success."""
    signature_store = SignatureES(Elasticsearch())
    mona_lisa_url = (
        'https://upload.wikimedia.org/wikipedia/commons/thumb/e/ec/Mona_Lisa,'
        '_by_Leonardo_da_Vinci,_from_C2RMF_retouched.jpg/687px-Mona_Lisa,'
        '_by_Leonardo_da_Vinci,_from_C2RMF_retouched.jpg'
    )
    signature_store.add_image(mona_lisa_url)
    return 'ok'
def add_files():
    """Index every file yielded by get_files() into image-match,
    logging a running count as it goes."""
    es = Elasticsearch()
    ses = SignatureES(es)
    # enumerate replaces the hand-rolled `n = 0; n += 1` counter
    for n, file in enumerate(get_files()):
        logger.info('{0} Adding file {1}'.format(n, file))
        ses.add_image(file)
def main():
    """Index every image found under the hard-coded directory into the
    'images' index (doc type 'image') on the local Elasticsearch node."""
    image_dir = '/home/key/图片/image_search_data'
    client = Elasticsearch(hosts=["127.0.0.1:9200"])
    signature_db = SignatureES(client, index='images', doc_type='image')
    # `walk` is a project helper that yields individual file paths
    for image_path in walk(image_dir):
        signature_db.add_image(image_path)
        print('index image: {}'.format(image_path))
def imgStoreTest(imagePath=r"D:\konachan\T22\311479.jpg"):
    """Index a single image into image-match, tagging it with its numeric ID.

    Generalized: the previously hard-coded path is now a parameter with the
    original value as default, so existing callers are unaffected.

    :param imagePath: image file path whose basename (before the first '.')
        must be an integer ID, e.g. ``311479.jpg``
    :raises ValueError: if the basename is not an integer
    """
    es = Elasticsearch()
    ses = SignatureES(es)
    # basename without extension doubles as the numeric image ID
    imageID = int(os.path.basename(imagePath).split(".")[0])
    metadata = {"imageID": imageID}
    ses.add_image(path=imagePath, metadata=metadata)
def add_to_es(self, img_dir=""):
    """Index every .png/.jpg file directly inside ``img_dir`` into image-match.

    :param img_dir: directory to scan (non-recursive)
    """
    es = Elasticsearch(hosts=[{"host": settings.ELASTICSEARCH_HOST}])
    ses = SignatureES(es, distance_cutoff=0.3)
    for file in os.listdir(img_dir):
        # os.path.splitext replaces the fragile "".join(split('.')[-1::])
        # extension extraction; os.path.join fixes the original
        # `img_dir + file` concatenation, which silently produced wrong
        # paths whenever img_dir lacked a trailing slash.
        file_ext = os.path.splitext(file)[1].lstrip('.')
        img_path = os.path.join(img_dir, file)
        if file_ext in ('png', 'jpg'):
            print(img_path, 'added.')
            ses.add_image(img_path)
def storeImage(inDir):
    """Recursively collect image files under ``inDir`` and index each one
    into image-match with its numeric ID as metadata.

    :param inDir: root directory to walk; files must be named ``<int>.<ext>``
    """
    es = Elasticsearch()
    ses = SignatureES(es)

    # Collect candidate image files
    fileList = []
    for folderName, subfolders, fileNames in os.walk(inDir):
        for fileName in fileNames:
            if fileName.endswith(("jpg", "png", "jpeg", "gif")):
                fileList.append(os.path.join(folderName, fileName))

    # Index each file with a progress bar.
    # (The original kept a `cnt` counter that was never read — removed.)
    pbar = tqdm(fileList, ncols=100)
    for imagePath in pbar:
        imageID = int(os.path.basename(imagePath).split(".")[0])
        pbar.set_description(f"Deal with {imageID}")
        metadata = {"imageID": imageID}
        ses.add_image(path=imagePath, metadata=metadata)
from elasticsearch import Elasticsearch
from image_match.elasticsearch_driver import SignatureES
import sys

# CLI: <script> <image-path-or-url> <metadata-string>
# Indexes a single image into the default image-match store.
# NOTE(review): sys.argv[2] is passed as a raw string; if structured
# metadata (a dict) is expected downstream, the caller should parse it
# first — confirm intended usage.
es = Elasticsearch()
ses = SignatureES(es)
ses.add_image(sys.argv[1], metadata=sys.argv[2])
# Read image URLs from every *.txt file in the working directory and index
# each one into image-match, tagging it with its pixiv illust_id when the
# URL encodes one. URLs already present (by illust_id) are skipped.
urls = []
for path in glob.glob('*.txt'):
    with open(path, "r") as f:
        urls.extend(f.readlines())

for i, url in enumerate(urls):
    print(f"{i}/{len(urls)}", url)
    illust_id = urlToIllustId(url)
    # (dead `meta = {}` initializer removed — it was always overwritten)
    if illust_id is not False:
        meta = {'illust_id': illust_id}
    else:
        meta = None
    # Skip URLs whose illust_id is already indexed
    r = ses.es.search(
        index="images",
        body={'query': {
            'match': {
                'metadata.illust_id': illust_id
            }
        }})
    if (r['hits']['total'] == 0):
        try:
            ses.add_image(url, metadata=meta)
        except urllib.error.HTTPError as err:
            # tolerate dead/forbidden links; anything else is fatal
            if err.code == 403 or err.code == 404:
                print('skip because 403 or 404')
            else:
                raise err
import os
from glob import glob
from image_match.goldberg import ImageSignature
from elasticsearch import Elasticsearch
from image_match.elasticsearch_driver import SignatureES

# Need to start elastic search $elasticsearch on osx, $sudo service elasticsearch start on ubuntu
"""Originally wanted to remove duplicate images to speed up training with this script but due to a lack of time it was unfinished"""

psychic_learners_dir = os.path.split(os.getcwd())[0]
image_directory = os.path.join(psychic_learners_dir, 'data', 'image', 'train_v1')
category_directories = glob(os.path.join(image_directory, '*'))

# Hoisted out of the loop: the original rebuilt Elasticsearch()/SignatureES
# once per category, which is loop-invariant work; one store suffices.
es = Elasticsearch()
ses = SignatureES(es)

for category_directory in category_directories:
    image_filenames = glob(os.path.join(category_directory, '*.jpg'))
    for image_filename in image_filenames:
        ses.add_image(image_filename)
    for image_filename in image_filenames:
        ses.search_image(image_filename)
class WorkWithSignatures():
    """Wrapper around an image-match SignatureES store with tunable
    signature parameters and threshold/rotation-aware search helpers.

    NOTE(review): `es`/`ses` are class attributes created at import time —
    loading this module connects to the 'elasticsearch_img' host.
    """

    # Default goldberg-signature parameters used to build `ses` below.
    n_grid = 9
    crop_percentile = (5, 95)
    P = None
    diagonal_neighbors = True
    identical_tolerance = 2 / 255
    n_levels = 2
    search_rotated = False

    es = Elasticsearch(
        ['elasticsearch_img'],
        port=9200,
    )
    # es = Elasticsearch()
    ses = SignatureES(es,
                      n_grid=n_grid,
                      crop_percentile=crop_percentile,
                      diagonal_neighbors=diagonal_neighbors,
                      identical_tolerance=identical_tolerance,
                      n_levels=n_levels,
                      distance_cutoff=0.9999)

    def clear_db(self):
        """Drop the 'images' index (missing index is ignored)."""
        self.es.indices.delete(index='images', ignore=[400, 404])

    def reload_params(self, params):
        """Rebuild the signature store from the given parameter dict.

        Expects keys: n_grid, crop_percentile, P, diagonal_neighbors,
        identical_tolerance, n_levels (and optionally search_rotated).
        NOTE(review): mutates the caller's dict (pops 'search_rotated');
        unlike the sibling class, this variant does NOT update
        self.search_rotated here — use set_rotate_param for that.
        """
        self.n_grid = params['n_grid']
        self.crop_percentile = params['crop_percentile']
        self.P = params['P']
        self.diagonal_neighbors = params['diagonal_neighbors']
        self.identical_tolerance = params['identical_tolerance']
        self.n_levels = params['n_levels']
        # search_rotated is a search-time flag, not a SignatureES kwarg
        params.pop("search_rotated", None)
        self.ses = SignatureES(self.es, **params)

    def get_all_params(self):
        """Return the current signature parameters (search_rotated excluded)."""
        return {'n_grid': self.n_grid,
                'crop_percentile': self.crop_percentile,
                'P': self.P,
                'diagonal_neighbors': self.diagonal_neighbors,
                'identical_tolerance': self.identical_tolerance,
                'n_levels': self.n_levels}

    def set_rotate_param(self, rotate):
        """Set whether searches should try all four orientations."""
        self.search_rotated = rotate

    def get_rotate_param(self):
        """Return the current rotation-search flag."""
        return self.search_rotated

    def load_file(self, path):
        """Index a single image file (path or URL) into the store."""
        self.ses.add_image(path)

    def search_file(self, file_bytes):
        """Search by raw image bytes using the instance rotation flag."""
        return self.ses.search_image(file_bytes,
                                     bytestream=True,
                                     all_orientations=self.search_rotated)

    def search_file_with_threshold(self, file_bytes, threshold):
        """Search by raw bytes; a nonzero threshold builds a temporary
        store with that distance_cutoff, 0.0 uses the default store."""
        if threshold == 0.0:
            return self.ses.search_image(file_bytes,
                                         bytestream=True,
                                         all_orientations=self.search_rotated)
        else:
            ses = SignatureES(self.es, distance_cutoff=threshold)
            return ses.search_image(file_bytes,
                                    bytestream=True,
                                    all_orientations=self.search_rotated)

    def search_file_with_threshold_and_rotated(self, file_bytes, threshold, search_rotated):
        """Like search_file_with_threshold, but with an explicit
        per-call rotation flag instead of the instance one."""
        if threshold == 0.0:
            return self.ses.search_image(file_bytes,
                                         bytestream=True,
                                         all_orientations=search_rotated)
        else:
            ses = SignatureES(self.es,
                              distance_cutoff=threshold)
            return ses.search_image(file_bytes,
                                    bytestream=True,
                                    all_orientations=search_rotated)

    def get_summary_count(self):
        """Return the hit-count object for all docs in images* indices.

        NOTE(review): on Elasticsearch 7+ this is a dict
        ({'value': n, ...}), not a plain int — confirm callers handle it.
        """
        return self.es.search(index="images*", size=0)['hits']['total']

    def delete_file_from_es(self, path):
        """Delete every document whose stored path exactly equals `path`.

        :raises Exception: if no document matches.
        """
        # match query may return partial matches; filter to exact paths
        matching_paths = [item['_id'] for item in
                          self.es.search(body={'query':
                                               {'match':
                                                {'path': path}
                                                }
                                               },
                                         index='images')['hits']['hits']
                          if item['_source']['path'] == path]
        if len(matching_paths) > 0:
            for id_tag in matching_paths:
                self.es.delete(index='images', doc_type='image', id=id_tag)
        else:
            raise Exception("File does not exists")

    def delete_duplicate_signature(self):
        """Delete documents carrying a signature identical to an earlier
        document's, then run image-match's own per-path deduplication.

        NOTE(review): match_all returns only the first page of hits
        (default 10), so this dedupes only that window — verify scale.
        """
        all_data = self.es.search(index="images", body={"query": {"match_all": {}}})
        ids_and_sings = [(d['_id'], d['_source']['signature'])
                         for d in all_data['hits']['hits']]
        # O(n^2) pairwise comparison; keeps the first of each duplicate pair
        to_delete = [elem[0] for index, elem in enumerate(ids_and_sings)
                     for j in ids_and_sings[index+1:]
                     if numpy.array_equal(elem[1], j[1])]
        for id_tag in set(to_delete):
            self.es.delete(index='images', doc_type='image', id=id_tag)
        paths = [d['_source']['path'] for d in all_data['hits']['hits']]
        for path in paths:
            self.ses.delete_duplicates(path)
class ImageFinder():
    """Find groups of visually similar images indexed in Elasticsearch
    via image-match."""

    def __init__(self):
        # BASE_DIR must be set by the caller before add_images() is used
        self.BASE_DIR = None
        self.DISTANCE_CUTOFF = 0.4
        self.es = Elasticsearch()
        self.ses = SignatureES(self.es, distance_cutoff=self.DISTANCE_CUTOFF)
        self.index_name = "images"

    def es_iterate_all_documents(self, index, pagesize=250, scroll_timeout="1m", **kwargs):
        """
        https://techoverflow.net/2019/05/07/elasticsearch-how-to-iterate-scroll-through-all-documents-in-index/
        Helper to iterate ALL values from a single index.
        Yields each document's _source dict.
        """
        scroll_id = None
        while True:
            if scroll_id is None:
                # First page: open the scroll context
                result = self.es.search(index=index, scroll="1m", **kwargs,
                                        body={"size": pagesize})
            else:
                result = self.es.scroll(body={
                    "scroll_id": scroll_id,
                    "scroll": scroll_timeout
                })
            scroll_id = result["_scroll_id"]
            hits = result["hits"]["hits"]
            # Stop once a page comes back empty
            if not hits:
                break
            yield from (hit['_source'] for hit in hits)

    def get_similar_groups(self):
        """Yield, for every indexed image, the list of its matches whenever
        more than the image itself is found."""
        for entry in self.es_iterate_all_documents(index=self.index_name):
            similar = self.ses.search_image(entry['path'])
            if len(similar) != 1:
                # list(...) replaces the redundant [record for record in similar]
                yield list(similar)

    def add_images(self):
        """Walk BASE_DIR, index every jpeg/jpg/png file, yielding each path.

        NOTE(review): the original wrapped indices.create and add_image in
        try/except blocks that only re-raised the same exception — those
        no-op handlers were removed; errors now propagate directly.
        """
        self.es.indices.create(index=self.index_name)
        img_formats = ('jpeg', 'jpg', 'png')
        for root, dirs, files in os.walk(self.BASE_DIR):
            for name in files:
                fn = os.path.abspath(join(root, name))
                # extension = text after the last '.' (whole name if none)
                *_, frmt = name.split('.')
                if (frmt.lower() in img_formats):
                    self.ses.add_image(fn)
                    yield fn

    def delete_index(self):
        """Drop the whole index; missing index is ignored."""
        self.es.indices.delete(index=self.index_name, ignore=[400, 404])

    def delete_doc(self, id):
        """Delete one document by its Elasticsearch id."""
        self.es.delete(index=self.index_name, doc_type='image', id=id)

    def add_doc(self, file_path):
        """Index a single image file into the store."""
        self.ses.add_image(file_path)

    def is_index_created(self):
        """Return True if the target index already exists."""
        return self.es.indices.exists(index=self.index_name)
from os import listdir
from os.path import isfile, join

# Index every non-hidden file in the chosen directory into image-match,
# printing per-file progress.
# path_to_directory = "/home/jamesqiu/Desktop/test_in_lab"
path_to_directory = "/home/jamesqiu/Desktop/test-set2"
images = [f for f in listdir(path_to_directory)
          if isfile(join(path_to_directory, f)) and f[0] != '.']

from elasticsearch import Elasticsearch
from image_match.elasticsearch_driver import SignatureES

es = Elasticsearch()
ses = SignatureES(es)

print("Adding images to the database...")
# enumerate replaces the manual counter; len(images) is hoisted out of
# the loop; join builds the path instead of '/' concatenation.
total = len(images)
for count, image in enumerate(images, start=1):
    print(image + " - [ " + str(count) + " / " + str(total) + " ]")
    ses.add_image(join(path_to_directory, image))
print("Done")
import os
from os import listdir
from os.path import isfile, join
import sys
from elasticsearch import Elasticsearch
from image_match.elasticsearch_driver import SignatureES

# Index every regular file from ImagensOriginais into the 'amcomamil' index.
es = Elasticsearch()
ses = SignatureES(es, index='amcomamil')

for entry in listdir('ImagensOriginais'):
    # skip anything that is not a regular file (subdirectories etc.)
    if not isfile(join('ImagensOriginais', entry)):
        continue
    file_path = os.path.abspath('ImagensOriginais/' + entry)
    print('Indexando imagem ' + file_path)
    ses.add_image(file_path)
class ElasticSearchStoreBackend(ImageSignatureStore):
    """Image signature persistence backed by image_match and Elasticsearch,
    handling both ES 6 (typed docs) and ES 7+ (typeless) servers."""

    # Default document types per major server version.
    DEFAULT_EL_DOC_TYPE_EL_6 = 'image'
    DEFAULT_EL_DOC_TYPE_EL_7 = '_doc'

    def __init__(self, host: str, port: int, el_index: str,
                 el_version: int = None,
                 el_doctype: str = None,
                 max_dist: float = 0.03,
                 use_exif_data: bool = True,
                 setup_database: bool = True):
        """
        Image signature persistence backed by image_match and elasticsearch
        :param host: host address of the elasticsearch server
        :param port: port of the elasticsearch server
        :param el_version: elasticsearch version
        :param el_index: elasticsearch index where the data is stored
        :param el_doctype: elasticsearch document type of the stored data
        :param max_dist: maximum "difference" allowed, ranging from [0 .. 1] where 0.2 is still a pretty similar image
        :param use_exif_data: whether to use EXIF data (forwarded to base class)
        :param setup_database: create the index on construction if missing
        :raises AssertionError: if the detected server version conflicts with
            el_version, or if database setup fails
        """
        super().__init__(use_exif_data)
        self.host = host
        self.port = port

        # NOTE(review): retries forever (2s interval) until the server
        # answers — there is no timeout/abort path here.
        detected_version = None
        while detected_version is None:
            time.sleep(2)
            detected_version = self._detect_db_version()

        self._el_version = el_version
        if self._el_version is not None and detected_version is not None and self._el_version != detected_version:
            raise AssertionError(
                "Detected database version ({}) does not match expected version ({})".format(detected_version,
                                                                                             self._el_version))
        if detected_version is not None:
            self._el_version = detected_version
        elif self._el_version is None:
            # assume version 6 by default
            # NOTE(review): unreachable — the retry loop above only exits
            # once detected_version is not None.
            self._el_version = 6

        self._el_index = el_index
        if el_doctype is not None:
            self._el_doctype = el_doctype
        else:
            # typed doc name on ES 6, the fixed '_doc' on ES 7+
            self._el_doctype = self.DEFAULT_EL_DOC_TYPE_EL_6 if self._el_version < 7 else self.DEFAULT_EL_DOC_TYPE_EL_7

        self.setup_database = setup_database
        if setup_database:
            try:
                # self._clear_database()
                self._setup_database()
            except Exception as e:
                logging.exception(e)
                raise AssertionError("Could not setup database")

        # noinspection PyTypeChecker
        self._store = SignatureES(
            es=Elasticsearch(
                hosts=[
                    {'host': self.host, 'port': self.port}
                ]
            ),
            el_version=self._el_version,
            index=self._el_index,
            doc_type=self._el_doctype,
            distance_cutoff=max_dist
        )

    def _detect_db_version(self) -> "int | None":
        """Query the server root endpoint and return its major version,
        or None if the server is unreachable or responds abnormally."""
        try:
            response = requests.get('http://{}:{}'.format(self.host, self.port))
            response.raise_for_status()
            # "7.10.2" -> 7
            return int(str(response.json()["version"]['number']).split(".")[0])
        except Exception as ex:
            logging.exception(ex)
            return None

    def _setup_database(self):
        """
        Creates the expected index, if it does not exist
        """
        response = requests.get('http://{}:{}/{}'.format(self.host, self.port, self._el_index))
        if response.status_code == 200:
            # index already exists — nothing to do
            return
        elif response.status_code == 404:
            # 'path' is mapped as keyword so exact-term queries work
            properties = {
                "properties": {
                    "path": {
                        "type": "keyword",
                        "ignore_above": 256
                    }
                }
            }
            if self._el_version == 7:
                json_data = {
                    "mappings": properties
                }
            else:
                # ES 6 nests the mapping under the document type
                json_data = {
                    "mappings": {
                        self._el_doctype: properties
                    }
                }
            response = requests.put(
                url='http://{}:{}/{}'.format(self.host, self.port, self._el_index),
                json=json_data
            )
            response.raise_for_status()
        else:
            response.raise_for_status()

    def _clear_database(self):
        """
        Removes the index and all data it contains
        """
        requests.delete('http://{}:{}/{}'.format(self.host, self.port, self._el_index))

    def _add(self, image_file_path: str, image_data: dict) -> None:
        """Insert (or replace) the signature entry for a file."""
        # remove existing entries
        self.remove(image_file_path)
        self._store.add_image(image_file_path, metadata=image_data)

    def get(self, image_file_path: str) -> "dict | None":
        """
        Get a store entry by it's file_path
        :param image_file_path: file path to search for
        :return:
        """
        db_entity = self._get(image_file_path)
        return db_entity

    def _get(self, image_file_path: str) -> "dict | None":
        """
        Get a store entry by it's file_path
        :param image_file_path: file path to search for
        :return: elasticsearch result dictionary
        """
        # exact-match on the keyword-mapped 'path' field
        es_query = {
            'query': {
                "constant_score": {
                    "filter": {
                        "term": {'path': image_file_path}
                    }
                }
            }
        }
        query_result = self._store.es.search(index=self._el_index, body=es_query)
        hits = query_result['hits']['hits']
        if len(hits) > 1:
            # duplicate entries for one path: wipe and re-add to heal
            echo(f"WARNING: More than a single entry for a file, cleaning up: {image_file_path}", color='yellow')
            self.remove(image_file_path)
            self.add(image_file_path)
        if len(hits) == 0:
            return None
        else:
            return hits[0]['_source']

    def get_all(self) -> (int, object):
        """Return (total document count, scan-iterator over all documents)."""
        es_query = {
            "track_total_hits": True,
            'query': {'match_all': {}}
        }
        item_count = self._store.es.search(index=self._el_index, body=es_query, size=0)['hits']['total']
        if self._el_version >= 7:
            # ES 7 wraps the count: {'value': n, 'relation': ...}
            item_count = item_count['value']

        from elasticsearch.helpers import scan
        # doc_type is only a valid argument on ES < 7
        el6_params = {
            "doc_type": self._el_doctype
        }
        return item_count, scan(
            self._store.es,
            index=self._el_index,
            preserve_order=True,
            query=es_query,
            **(el6_params if self._el_version < 7 else {})
        )

    def find_similar(self, reference_image_file_path: str) -> []:
        """Return entries similar to the given image; [] on any error.

        If the image is already indexed its stored record is used;
        otherwise the image is analyzed fresh (trying all orientations).
        """
        try:
            entry = self._get(reference_image_file_path)
            if entry is not None:
                result = []
                rec = self._store.search_single_record(entry)
                result.extend(rec)
                return result
            else:
                return self._store.search_image(reference_image_file_path, all_orientations=True)
        except Exception as e:
            echo(f"Error querying database for similar images of '{reference_image_file_path}': {e}", color="red")
            return []

    def search_metadata(self, metadata: dict) -> []:
        """
        Search for images with metadata properties.
        Note: Metadata will be empty if you did not provide it when adding an image
        :param metadata: key/value pairs matched against the stored
            ``metadata.<key>`` fields
        :return: raw elasticsearch search response
        """
        search_dict = {}
        for key, value in metadata.items():
            search_dict[f"metadata.{key}"] = value
        es_query = {
            'query': {'match': search_dict}
        }
        return self._store.es.search(index=self._el_index, body=es_query)

    def remove(self, image_file_path: str) -> None:
        """Delete every entry whose 'path' exactly matches the given path."""
        # NOTE: this query will only work if the index has been created
        # with a custom mapping for the path property:
        #
        # remove existing index
        # curl -X DELETE "192.168.2.24:9200/images"
        #
        # # create index with custom mapping for "path"
        # curl -X PUT "192.168.2.24:9200/images?pretty" -H "Content-Type: application/json" -d
        # "
        # {
        #   "mappings": {
        #     "image": {
        #       "properties": {
        #         "path": {
        #           "type": "keyword",
        #           "ignore_above": 256
        #         }
        #       }
        #     }
        #   }
        # }
        # "
        es_query = {
            'query': {
                "constant_score": {
                    "filter": {
                        "term": {'path': image_file_path}
                    }
                }
            }
        }
        self._remove_by_query(es_query)

    def remove_all(self) -> None:
        """Delete every document in the index (the index itself remains)."""
        es_query = {
            'query': {'match_all': {}}
        }
        self._remove_by_query(es_query)

    def _remove_by_query(self, es_query: dict):
        """Run delete_by_query, passing doc_type only on ES < 7."""
        el6_params = {
            "doc_type": self._el_doctype
        }
        return self._store.es.delete_by_query(
            index=self._el_index,
            body=es_query,
            conflicts="proceed",
            **(el6_params if self._el_version < 7 else {})
        )