Exemplo n.º 1
0
def reAndThum(filePath, outFolder, outSize=(200, 200)):
    """Rename *filePath*, create a thumbnail in *outFolder*, and register
    the thumbnail with the image-match database.

    On any failure the original path is appended to the shared global
    ``failedList`` (access serialized by the global ``lock``).

    :param filePath: path of the source image to process
    :param outFolder: directory that receives the generated thumbnail
    :param outSize: (width, height) of the thumbnail, default (200, 200)
    """
    global failedList

    def _record_failure(message):
        # BUGFIX: `with lock` guarantees release even if append/print
        # raises — the original `x and lock.acquire()` pattern leaked
        # the lock on an exception inside the critical section.
        with lock:
            failedList.append(filePath)
            print(os.path.basename(outPath), message)
            print()

    # Step 1: rename the file; a -1 flag signals failure.
    flag1, outPath = reName(filePath)
    if flag1 == -1:
        _record_failure("failed.")
        return

    # Step 2: build the thumbnail; a -1 flag signals failure.
    flag2, thumbPath = mkThumb(outPath, outFolder, outSize)
    if flag2 == -1:
        _record_failure("failed.")
        return

    # Step 3: add the thumbnail to the image-match database.
    try:
        es = Elasticsearch()
        ses = SignatureES(es)
        ses.add_image(thumbPath)
    except Exception:
        _record_failure("Failed to add to image-match database.")
Exemplo n.º 2
0
def add_imgs():
    """Sanity-check signature distance on two Mona Lisa copies, then
    index one sample catalog image into Elasticsearch.

    NOTE(review): the original version contained a directory-indexing
    loop *after* the ``return`` statement; it was unreachable and has
    been removed.
    """
    gis = ImageSignature()
    a = gis.generate_signature(
        'https://upload.wikimedia.org/wikipedia/commons/thumb/e/ec/Mona_Lisa,_by_Leonardo_da_Vinci,_from_C2RMF_retouched.jpg/687px-Mona_Lisa,_by_Leonardo_da_Vinci,_from_C2RMF_retouched.jpg'
    )
    b = gis.generate_signature(
        'https://upload.wikimedia.org/wikipedia/commons/thumb/9/99/Gioconda_%28copia_del_Museo_del_Prado_restaurada%29.jpg/800px-Gioconda_%28copia_del_Museo_del_Prado_restaurada%29.jpg'
    )
    # Normalized distance in [0, 1]; smaller means more similar.
    res = gis.normalized_distance(a, b)
    print(res)

    es = Elasticsearch()
    ses = SignatureES(es)

    mypath = '/var/www/html/boots-market/image/catalog/product'

    ses.add_image(mypath + '/' +
                  'almcdnruimg389x562frfr030awdzpc579240581v1.jpg')
Exemplo n.º 3
0
class WorkWithSignatures():
    """Wrapper around image_match's SignatureES driver.

    Keeps the tunable signature parameters as class attributes and
    talks to an Elasticsearch node reachable as host 'elasticsearch'.
    """

    # Default signature parameters (see the image_match documentation).
    n_grid = 9
    crop_percentile = (5, 95)
    P = None
    diagonal_neighbors = True
    identical_tolerance = 2 / 255
    n_levels = 2
    search_rotated = False

    es = Elasticsearch(
        ['elasticsearch'],
        port=9200,
    )

    ses = SignatureES(es,
                      n_grid=n_grid,
                      crop_percentile=crop_percentile,
                      diagonal_neighbors=diagonal_neighbors,
                      identical_tolerance=identical_tolerance,
                      n_levels=n_levels)

    def clear_db(self):
        """Drop the 'images' index and rebuild the driver."""
        self.es.indices.delete(index='images', ignore=[400, 404])
        # BUGFIX: keep using the existing client (configured for the
        # 'elasticsearch' host) — the original reconnected to localhost
        # via a bare Elasticsearch() here.
        self.ses = SignatureES(self.es,
                               n_grid=self.n_grid,
                               crop_percentile=self.crop_percentile,
                               diagonal_neighbors=self.diagonal_neighbors,
                               identical_tolerance=self.identical_tolerance,
                               n_levels=self.n_levels)

    def reload_params(self, params):
        """Adopt new signature parameters and rebuild the driver.

        :param params: dict with keys n_grid, crop_percentile, P,
            diagonal_neighbors, identical_tolerance, n_levels and
            search_rotated (the last is not accepted by SignatureES)
        """
        self.n_grid = params['n_grid']
        self.crop_percentile = params['crop_percentile']
        self.P = params['P']
        self.diagonal_neighbors = params['diagonal_neighbors']
        self.identical_tolerance = params['identical_tolerance']
        self.n_levels = params['n_levels']
        self.search_rotated = params['search_rotated']
        # BUGFIX: operate on a copy so the caller's dict is not mutated
        # by the pop() below.
        driver_params = dict(params)
        driver_params.pop("search_rotated", None)
        self.ses = SignatureES(self.es, **driver_params)

    def get_all_params(self):
        """Return the current parameter set, including search_rotated."""
        return {
            'n_grid': self.n_grid,
            'crop_percentile': self.crop_percentile,
            'P': self.P,
            'diagonal_neighbors': self.diagonal_neighbors,
            'identical_tolerance': self.identical_tolerance,
            'n_levels': self.n_levels,
            'search_rotated': self.search_rotated
        }

    def load_file(self, path):
        """Index the image at *path*."""
        self.ses.add_image(path)

    def search_file(self, file_bytes):
        """Search by raw image bytes, honouring the rotation flag."""
        return self.ses.search_image(file_bytes,
                                     bytestream=True,
                                     all_orientations=self.search_rotated)
Exemplo n.º 4
0
def add():
    """Index one reference image (the Louvre Mona Lisa) and report success."""
    client = Elasticsearch()
    signatures = SignatureES(client)
    mona_lisa_url = (
        'https://upload.wikimedia.org/wikipedia/commons/thumb/e/ec/Mona_Lisa,_by_Leonardo_da_Vinci,_from_C2RMF_retouched.jpg/687px-Mona_Lisa,_by_Leonardo_da_Vinci,_from_C2RMF_retouched.jpg'
    )
    signatures.add_image(mona_lisa_url)
    return 'ok'
Exemplo n.º 5
0
def add_files():
    """Index every file yielded by get_files(), logging each one with its position."""
    client = Elasticsearch()
    signatures = SignatureES(client)
    # enumerate replicates the original manual counter (logged, then incremented).
    for counter, path in enumerate(get_files()):
        logger.info('{0} Adding file {1}'.format(counter, path))
        signatures.add_image(path)
Exemplo n.º 6
0
def main():
    """Index every image found under the hard-coded image directory."""
    image_dir = '/home/key/图片/image_search_data'
    client = Elasticsearch(hosts=["127.0.0.1:9200"])
    signatures = SignatureES(client, index='images', doc_type='image')

    # NOTE(review): assumes `walk` yields file paths directly (not
    # os.walk's (dirpath, dirnames, filenames) tuples) — confirm
    # against its definition elsewhere in the project.
    for image_path in walk(image_dir):
        signatures.add_image(image_path)
        print('index image: {}'.format(image_path))
Exemplo n.º 7
0
def imgStoreTest():
    """Index a single local image, using its numeric file-name stem as imageID."""
    client = Elasticsearch()
    signatures = SignatureES(client)
    image_path = r"D:\konachan\T22\311479.jpg"
    # The file name stem ("311479") doubles as the numeric image id.
    image_id = int(os.path.basename(image_path).split(".")[0])
    signatures.add_image(path=image_path, metadata={"imageID": image_id})
Exemplo n.º 8
0
    def add_to_es(self, img_dir=""):
        """Index every .png/.jpg file in *img_dir* into image-match.

        :param img_dir: directory to scan (non-recursive)
        """
        es = Elasticsearch(hosts=[{"host": settings.ELASTICSEARCH_HOST}])
        ses = SignatureES(es, distance_cutoff=0.3)

        for file_name in os.listdir(img_dir):
            # BUGFIX: os.path.splitext is robust for names without a dot
            # (the old split('.')[-1] treated "README" as its own
            # extension), and os.path.join inserts the path separator the
            # old `img_dir + file` concatenation could miss.
            ext = os.path.splitext(file_name)[1].lstrip('.').lower()
            if ext in ('png', 'jpg'):
                img_path = os.path.join(img_dir, file_name)
                print(img_path, 'added.')
                ses.add_image(img_path)
Exemplo n.º 9
0
def storeImage(inDir):
    """Recursively index every image under *inDir* into image-match.

    The numeric file-name stem of each image is stored as the
    ``imageID`` metadata field.

    :param inDir: root directory to walk
    """
    es = Elasticsearch()
    ses = SignatureES(es)
    # Collect image files. BUGFIX: match the extension including the dot
    # so a name such as "abcjpg" is no longer picked up by mistake.
    fileList = []
    for folderName, subfolders, fileNames in os.walk(inDir):
        for fileName in fileNames:
            if fileName.endswith((".jpg", ".png", ".jpeg", ".gif")):
                fileList.append(os.path.join(folderName, fileName))
    # Index each file, showing progress (the unused `cnt` counter was removed).
    pbar = tqdm(fileList, ncols=100)
    for imagePath in pbar:
        # File-name stem must be numeric, e.g. "311479.jpg" -> 311479.
        imageID = int(os.path.basename(imagePath).split(".")[0])
        pbar.set_description(f"Deal with {imageID}")
        ses.add_image(path=imagePath, metadata={"imageID": imageID})
from elasticsearch import Elasticsearch
from image_match.elasticsearch_driver import SignatureES
import sys

# BUGFIX: fail with a usage message instead of an IndexError when the
# script is invoked with fewer than two arguments.
if len(sys.argv) < 3:
    sys.exit("usage: {} <image-path-or-url> <metadata>".format(sys.argv[0]))

es = Elasticsearch()
ses = SignatureES(es)
# argv[1]: image path/URL; argv[2]: metadata (passed through as a string).
ses.add_image(sys.argv[1], metadata=sys.argv[2])
Exemplo n.º 11
0
# Collect image URLs from every *.txt file in the working directory.
urls = []
for path in glob.glob('*.txt'):
    with open(path, "r") as f:
        urls.extend(f.readlines())

for i, url in enumerate(urls):
    print(f"{i}/{len(urls)}", url)
    illust_id = urlToIllustId(url)

    # BUGFIX: dropped the dead `meta = {}` initializer — both branches
    # below always rebind `meta`.
    if illust_id is not False:
        meta = {'illust_id': illust_id}
    else:
        meta = None
    # Skip URLs whose illust_id is already indexed.
    # NOTE(review): this search runs even when illust_id is False —
    # confirm that matching on False behaves as intended.
    r = ses.es.search(
        index="images",
        body={'query': {
            'match': {
                'metadata.illust_id': illust_id
            }
        }})
    if (r['hits']['total'] == 0):
        try:
            ses.add_image(url, metadata=meta)
        except urllib.error.HTTPError as err:
            # Dead links (403/404) are expected; anything else is fatal.
            if err.code == 403 or err.code == 404:
                print('skip because 403 or 404')
            else:
                raise err
Exemplo n.º 12
0
import os
from glob import glob

from image_match.goldberg import ImageSignature
from elasticsearch import Elasticsearch
from image_match.elasticsearch_driver import SignatureES
# Need to start elastic search $elasticsearch on osx, $sudo service elasticsearch start on ubuntu
"""Originally wanted to remove duplicate images to speed up training with this script but due to a lack of time it was unfinished"""

psychic_learners_dir = os.path.split(os.getcwd())[0]
image_directory = os.path.join(psychic_learners_dir, 'data', 'image',
                               'train_v1')
category_directories = glob(os.path.join(image_directory, '*'))

# PERF: create the Elasticsearch client and driver once — the original
# rebuilt both for every category directory.
es = Elasticsearch()
ses = SignatureES(es)

for category_directory in category_directories:
    image_filenames = glob(os.path.join(category_directory, '*.jpg'))
    for image_filename in image_filenames:
        ses.add_image(image_filename)
    # NOTE(review): search results are discarded — the script was left
    # unfinished (see docstring above).
    for image_filename in image_filenames:
        ses.search_image(image_filename)
Exemplo n.º 13
0
class WorkWithSignatures():
    """Wrapper around image_match's SignatureES driver.

    Holds the tunable signature parameters as class attributes and
    exposes load/search/delete helpers on top of an Elasticsearch
    backend reachable under the host name 'elasticsearch_img'.
    """

    # Default signature parameters (see the image_match documentation).
    n_grid = 9
    crop_percentile = (5, 95)
    P = None
    diagonal_neighbors = True
    identical_tolerance = 2 / 255
    n_levels = 2
    search_rotated = False

    es = Elasticsearch(
        ['elasticsearch_img'],
        port=9200,
    )
    # es = Elasticsearch()

    ses = SignatureES(es, n_grid=n_grid, crop_percentile=crop_percentile, diagonal_neighbors=diagonal_neighbors,
                      identical_tolerance=identical_tolerance, n_levels=n_levels, distance_cutoff=0.9999)

    def clear_db(self):
        """Drop the 'images' index; ignore "missing index" errors."""
        self.es.indices.delete(index='images', ignore=[400, 404])

    def reload_params(self, params):
        """Adopt new signature parameters and rebuild the driver.

        :param params: dict with keys n_grid, crop_percentile, P,
            diagonal_neighbors, identical_tolerance, n_levels and
            optionally search_rotated (not accepted by SignatureES)
        """
        self.n_grid = params['n_grid']
        self.crop_percentile = params['crop_percentile']
        self.P = params['P']
        self.diagonal_neighbors = params['diagonal_neighbors']
        self.identical_tolerance = params['identical_tolerance']
        self.n_levels = params['n_levels']
        # BUGFIX: operate on a copy so the caller's dict is not mutated
        # by the pop() below.
        driver_params = dict(params)
        driver_params.pop("search_rotated", None)
        self.ses = SignatureES(self.es, **driver_params)

    def get_all_params(self):
        """Return the current signature parameters as a dict."""
        return {'n_grid': self.n_grid,
                'crop_percentile': self.crop_percentile,
                'P': self.P,
                'diagonal_neighbors': self.diagonal_neighbors,
                'identical_tolerance': self.identical_tolerance,
                'n_levels': self.n_levels}

    def set_rotate_param(self, rotate):
        """Enable/disable searching all four image orientations."""
        self.search_rotated = rotate

    def get_rotate_param(self):
        """Return whether rotated search is enabled."""
        return self.search_rotated

    def load_file(self, path):
        """Index the image at *path*."""
        self.ses.add_image(path)

    def search_file(self, file_bytes):
        """Search by raw image bytes using the default distance cutoff."""
        return self.ses.search_image(file_bytes, bytestream=True, all_orientations=self.search_rotated)

    def search_file_with_threshold(self, file_bytes, threshold):
        """Search by raw bytes; threshold 0.0 means "use the default cutoff"."""
        if threshold == 0.0:
            return self.ses.search_image(file_bytes, bytestream=True, all_orientations=self.search_rotated)
        else:
            # A temporary driver carries the caller-supplied cutoff.
            ses = SignatureES(self.es, distance_cutoff=threshold)
            return ses.search_image(file_bytes, bytestream=True, all_orientations=self.search_rotated)

    def search_file_with_threshold_and_rotated(self, file_bytes, threshold, search_rotated):
        """Like search_file_with_threshold, with an explicit rotation flag."""
        if threshold == 0.0:
            return self.ses.search_image(file_bytes, bytestream=True, all_orientations=search_rotated)
        else:
            ses = SignatureES(self.es, distance_cutoff=threshold)
            return ses.search_image(file_bytes, bytestream=True, all_orientations=search_rotated)

    def get_summary_count(self):
        """Return the total hit count over every images* index."""
        return self.es.search(index="images*", size=0)['hits']['total']

    def delete_file_from_es(self, path):
        """Delete every document whose stored path equals *path* exactly.

        :raises Exception: when no document matches
        """
        matching_paths = [item['_id'] for item in
                          self.es.search(body={'query':
                                               {'match':
                                                {'path': path}
                                               }
                                              },
                                         index='images')['hits']['hits']
                          if item['_source']['path'] == path]
        if len(matching_paths) > 0:
            for id_tag in matching_paths:
                self.es.delete(index='images', doc_type='image', id=id_tag)
        else:
            raise Exception("File does not exists")

    def delete_duplicate_signature(self):
        """Remove documents that share an identical signature, keeping one.

        NOTE(review): es.search without a size argument returns at most
        10 hits by default, so only those documents are examined —
        confirm this is intended.
        """
        all_data = self.es.search(index="images", body={"query": {"match_all": {}}})
        ids_and_sings = [(d['_id'], d['_source']['signature']) for d in all_data['hits']['hits']]
        # Pairwise comparison: the earlier entry of each identical pair is
        # marked for deletion, so the last duplicate survives.
        to_delete = [elem[0] for index, elem in enumerate(ids_and_sings) for j in ids_and_sings[index + 1:] if numpy.array_equal(elem[1], j[1])]
        for id_tag in set(to_delete):
            self.es.delete(index='images', doc_type='image', id=id_tag)
        paths = [d['_source']['path'] for d in all_data['hits']['hits']]
        for path in paths:
            self.ses.delete_duplicates(path)
Exemplo n.º 14
0
class ImageFinder():
    """Locate groups of visually similar images stored in Elasticsearch."""

    def __init__(self):
        # Root directory scanned by add_images(); must be set by the caller.
        self.BASE_DIR = None
        # Maximum normalized distance for two images to count as similar.
        self.DISTANCE_CUTOFF = 0.4
        self.es = Elasticsearch()
        self.ses = SignatureES(self.es, distance_cutoff=self.DISTANCE_CUTOFF)
        self.index_name = "images"

    def es_iterate_all_documents(self,
                                 index,
                                 pagesize=250,
                                 scroll_timeout="1m",
                                 **kwargs):
        """
        https://techoverflow.net/2019/05/07/elasticsearch-how-to-iterate-scroll-through-all-documents-in-index/
        Helper to iterate ALL values from a single index
        Yields all the documents (their `_source` dicts).
        """
        is_first = True
        while True:
            # Scroll next
            if is_first:  # Initialize scroll
                result = self.es.search(index=index,
                                        scroll="1m",
                                        **kwargs,
                                        body={"size": pagesize})
                is_first = False
            else:
                result = self.es.scroll(body={
                    "scroll_id": scroll_id,
                    "scroll": scroll_timeout
                })
            scroll_id = result["_scroll_id"]
            hits = result["hits"]["hits"]
            # Stop after no more docs
            if not hits:
                break
            # Yield each entry
            yield from (hit['_source'] for hit in hits)

    def get_similar_groups(self):
        """Yield one match-list per image that has at least one other match."""
        for entry in self.es_iterate_all_documents(index=self.index_name):
            similar = self.ses.search_image(entry['path'])
            # A single hit is the image itself — not a group.
            if len(similar) != 1:
                yield list(similar)

    def add_images(self):
        """Index every jpeg/jpg/png under BASE_DIR, yielding each indexed path.

        Raises whatever indices.create raises when the index already
        exists — the original caught ElasticsearchException only to
        re-raise it unchanged, so the guard was removed.
        """
        self.es.indices.create(index=self.index_name)

        img_formats = ('jpeg', 'jpg', 'png')

        for root, dirs, files in os.walk(self.BASE_DIR):
            for name in files:
                fn = os.path.abspath(join(root, name))
                # Extension = text after the last dot, case-insensitive.
                *_, frmt = name.split('.')
                if frmt.lower() in img_formats:
                    # The original try/except here only re-raised the
                    # error unchanged, so it was removed as well.
                    self.ses.add_image(fn)
                    yield fn

    def delete_index(self):
        """Drop the whole index; ignore "already gone" errors."""
        self.es.indices.delete(index=self.index_name, ignore=[400, 404])

    def delete_doc(self, id):
        """Delete a single document by its id."""
        self.es.delete(index=self.index_name, doc_type='image', id=id)

    def add_doc(self, file_path):
        """Index one image file."""
        self.ses.add_image(file_path)

    def is_index_created(self):
        """Return True when the target index exists."""
        return self.es.indices.exists(index=self.index_name)
Exemplo n.º 15
0
from os import listdir
from os.path import isfile, join

# path_to_directory = "/home/jamesqiu/Desktop/test_in_lab"
path_to_directory = "/home/jamesqiu/Desktop/test-set2"

# Plain files only; skip dot-files such as ".DS_Store".
images = [f for f in listdir(path_to_directory) if isfile(join(path_to_directory, f)) and f[0] != '.']

from elasticsearch import Elasticsearch
from image_match.elasticsearch_driver import SignatureES

es = Elasticsearch()
ses = SignatureES(es)

print("Adding images to the database...")
total = len(images)
# enumerate(..., start=1) replicates the original pre-increment counter.
for position, image in enumerate(images, start=1):
    print(image + " - [ " + str(position) + " / " + str(total) + " ]")
    ses.add_image(path_to_directory + "/" + image)
print("Done")
import os
from os import listdir
from os.path import isfile, join
import sys
from elasticsearch import Elasticsearch
from image_match.elasticsearch_driver import SignatureES

es = Elasticsearch()
ses = SignatureES(es, index='amcomamil')

# Every plain file inside the source-image folder.
source_files = [
    entry for entry in listdir('ImagensOriginais')
    if isfile(join('ImagensOriginais', entry))
]

for source_file in source_files:
    file_path = os.path.abspath('ImagensOriginais/' + source_file)
    print('Indexando imagem ' + file_path)
    ses.add_image(file_path)
class ElasticSearchStoreBackend(ImageSignatureStore):
    # Document type names: ES 6 allowed custom mapping types, ES 7+
    # only accepts the reserved "_doc" type.
    DEFAULT_EL_DOC_TYPE_EL_6 = 'image'
    DEFAULT_EL_DOC_TYPE_EL_7 = '_doc'

    def __init__(self,
                 host: str,
                 port: int,
                 el_index: str,
                 el_version: int = None,
                 el_doctype: str = None,
                 max_dist: float = 0.03,
                 use_exif_data: bool = True,
                 setup_database: bool = True):
        """
        Image signature persistence backed by image_match and elasticsearch

        :param host: host address of the elasticsearch server
        :param port: port of the elasticsearch server
        :param el_version: elasticsearch version
        :param el_index: elasticsearch index where the data is stored
        :param el_doctype: elasticsearch document type of the stored data
        :param max_dist: maximum "difference" allowed, ranging from [0 .. 1] where 0.2 is still a pretty similar image
        :param use_exif_data: forwarded to the base ImageSignatureStore
        :param setup_database: when True, create the index/mapping on startup
        """
        super().__init__(use_exif_data)

        self.host = host
        self.port = port

        # Poll the server until it answers with a version.
        # NOTE(review): this loop never times out — if the server is
        # unreachable, __init__ blocks forever (retrying every 2 s).
        detected_version = None
        while detected_version is None:
            time.sleep(2)
            detected_version = self._detect_db_version()

        self._el_version = el_version
        # Fail fast if the caller pinned a version that contradicts the
        # detected one. (detected_version cannot be None here because of
        # the loop above, so the second check is redundant but harmless.)
        if self._el_version is not None and detected_version is not None and self._el_version != detected_version:
            raise AssertionError(
                "Detected database version ({}) does not match expected version ({})".format(detected_version,
                                                                                             self._el_version))

        # Prefer the detected version; fall back to the caller's, then 6.
        if detected_version is not None:
            self._el_version = detected_version
        elif self._el_version is None:
            # assume version 6 by default
            self._el_version = 6

        self._el_index = el_index
        if el_doctype is not None:
            self._el_doctype = el_doctype
        else:
            # ES >= 7 dropped custom mapping types in favour of "_doc".
            self._el_doctype = self.DEFAULT_EL_DOC_TYPE_EL_6 if self._el_version < 7 else self.DEFAULT_EL_DOC_TYPE_EL_7

        self.setup_database = setup_database
        if setup_database:
            try:
                # self._clear_database()
                self._setup_database()
            except Exception as e:
                logging.exception(e)
                raise AssertionError("Could not setup database")

        # noinspection PyTypeChecker
        self._store = SignatureES(
            es=Elasticsearch(
                hosts=[
                    {'host': self.host, 'port': self.port}
                ]
            ),
            el_version=self._el_version,
            index=self._el_index,
            doc_type=self._el_doctype,
            distance_cutoff=max_dist
        )

    def _detect_db_version(self):
        """Query the server root endpoint and return its major version.

        :return: the major version number as an int, or None when the
            server cannot be reached or the response cannot be parsed.
            (The original annotation ``-> int or None`` evaluated to
            plain ``int`` and was removed as misleading.)
        """
        try:
            response = requests.get('http://{}:{}'.format(self.host, self.port))
            response.raise_for_status()
            # e.g. {"version": {"number": "7.10.2"}} -> 7
            return int(str(response.json()["version"]['number']).split(".")[0])
        except Exception as ex:
            # Deliberate best-effort: any failure is logged and reported
            # as "unknown version" so the caller can retry.
            logging.exception(ex)
            return None

    def _setup_database(self):
        """
        Creates the expected index, if it does not exist
        """
        # 200 -> index already exists, nothing to do; 404 -> create it.
        response = requests.get('http://{}:{}/{}'.format(self.host, self.port, self._el_index))
        if response.status_code == 200:
            return
        elif response.status_code == 404:

            # Map `path` as a non-analyzed keyword so exact term queries
            # (see remove()) match whole paths.
            properties = {
                "properties": {
                    "path": {
                        "type": "keyword",
                        "ignore_above": 256
                    }
                }
            }

            # ES 7 dropped mapping types; older versions nest the
            # mapping under the document type name.
            if self._el_version == 7:
                json_data = {
                    "mappings": properties
                }
            else:
                json_data = {
                    "mappings": {
                        self._el_doctype: properties
                    }
                }

            response = requests.put(
                url='http://{}:{}/{}'.format(self.host, self.port, self._el_index),
                json=json_data
            )

            response.raise_for_status()
        else:
            # Unexpected status (auth failure, server error, ...): surface it.
            response.raise_for_status()

    def _clear_database(self):
        """Delete the whole index (and every document in it) via the REST API."""
        index_url = 'http://{}:{}/{}'.format(self.host, self.port, self._el_index)
        requests.delete(index_url)

    def _add(self, image_file_path: str, image_data: dict) -> None:
        """Store *image_data* for the image, replacing any previous entries.

        Deletes all existing documents for the path first so the store
        never holds duplicates for one file.
        """
        self.remove(image_file_path)
        self._store.add_image(image_file_path, metadata=image_data)

    def get(self, image_file_path: str):
        """
        Get a store entry by it's file_path

        :param image_file_path: file path to search for
        :return: the entry dict, or None when the path is unknown.
            (The original annotation ``-> dict or None`` evaluated to
            plain ``dict`` and was removed as misleading.)
        """
        return self._get(image_file_path)

    def _get(self, image_file_path: str):
        """
        Get a store entry by its file path.

        :param image_file_path: file path to search for
        :return: the elasticsearch ``_source`` dict, or None when no
            document matches. (The original ``-> dict or None``
            annotation evaluated to plain ``dict`` and was removed.)
        """
        # Exact term match on the keyword-mapped `path` field.
        es_query = {
            'query': {
                "constant_score": {
                    "filter": {
                        "term": {'path': image_file_path}
                    }
                }
            }
        }

        query_result = self._store.es.search(index=self._el_index, body=es_query)

        hits = query_result['hits']['hits']

        if len(hits) > 1:
            # Duplicate documents for one file: remove them all and re-add once.
            # NOTE(review): the stale hits[0] from before the cleanup is
            # still returned below, and `add` is expected to come from
            # the base class — confirm both are intended.
            echo(f"WARNING: More than a single entry for a file, cleaning up: {image_file_path}", color='yellow')
            self.remove(image_file_path)
            self.add(image_file_path)

        if len(hits) == 0:
            return None
        else:
            return hits[0]['_source']

    def get_all(self) -> (int, object):
        """Return (total document count, scan iterator over all documents).

        NOTE(review): the annotation ``-> (int, object)`` is a tuple
        literal, not a typing construct — kept as-is for compatibility.
        """
        es_query = {
            # Ask ES 7+ for the exact total instead of the 10k cap.
            "track_total_hits": True,
            'query': {'match_all': {}}
        }

        # size=0: only the hit count is needed here, not the documents.
        item_count = self._store.es.search(index=self._el_index, body=es_query, size=0)['hits']['total']
        if self._el_version >= 7:
            # ES 7 wraps the total as {"value": N, "relation": ...}.
            item_count = item_count['value']

        from elasticsearch.helpers import scan

        # ES < 7 still requires the document type for scan queries.
        el6_params = {
            "doc_type": self._el_doctype
        }
        return item_count, scan(
            self._store.es,
            index=self._el_index,
            preserve_order=True,
            query=es_query,
            **(el6_params if self._el_version < 7 else {})
        )

    def find_similar(self, reference_image_file_path: str) -> []:
        """Return records similar to the given image; [] on any error."""
        try:
            entry = self._get(reference_image_file_path)
            if entry is None:
                # Unknown path: compute the signature from the file itself.
                return self._store.search_image(reference_image_file_path, all_orientations=True)
            matches = []
            matches.extend(self._store.search_single_record(entry))
            return matches
        except Exception as e:
            # Best-effort lookup: report the problem and return nothing.
            echo(f"Error querying database for similar images of '{reference_image_file_path}': {e}", color="red")
            return []

    def search_metadata(self, metadata: dict) -> []:
        """
        Search for images with metadata properties.

        Note: Metadata will be empty if you did not provide it when adding an image
        :param metadata:
        :return:
        """
        # Each key is queried under the nested "metadata." prefix.
        match_fields = {f"metadata.{key}": value for key, value in metadata.items()}
        query = {'query': {'match': match_fields}}
        return self._store.es.search(index=self._el_index, body=query)

    def remove(self, image_file_path: str) -> None:
        """Delete every document whose `path` exactly equals the given path.

        NOTE: the exact term match only works when the index maps `path`
        as a non-analyzed keyword, i.e. the index was created with a
        custom mapping such as
        ``{"path": {"type": "keyword", "ignore_above": 256}}``
        (see _setup_database).
        """
        exact_path_query = {
            'query': {
                "constant_score": {
                    "filter": {
                        "term": {'path': image_file_path}
                    }
                }
            }
        }
        self._remove_by_query(exact_path_query)

    def remove_all(self) -> None:
        """Delete every document in the index."""
        self._remove_by_query({'query': {'match_all': {}}})

    def _remove_by_query(self, es_query: dict):
        """Run delete_by_query; ES < 7 additionally requires the doc type."""
        extra_params = {"doc_type": self._el_doctype} if self._el_version < 7 else {}
        return self._store.es.delete_by_query(
            index=self._el_index,
            body=es_query,
            conflicts="proceed",
            **extra_params
        )