def generateRepostsForAll(self,
                              count_per_post=1,
                              res=None,
                              rot=None,
                              asp=None,
                              crop=None,
                              uid=None,
                              seed=None):
        '''
        Generates reposts for every non-repost image known to the hash mapping.

        A source image is any key of the image-to-hash mapping whose name does
        not contain '_REPOST_'. Every generated repost is hashed, run through
        OCR and recorded in the in-memory mappings. The processed data is saved
        to the cache whenever the method exits, including after an interruption
        or an unexpected error.

        Parameters:
        count_per_post: number of reposts requested for each source image.
        res, rot, asp, crop: forwarded unchanged to generate_bad_repost.
        uid: optional prefix placed in front of '_REPOST_' in the repost name
             (ignored when falsy).
        seed: optional base seed. The image at position i of the sorted source
              names is generated with seed + i. When None, no seed is forwarded
              and generate_bad_repost is left to use its own default.

        Returns:
        True if the loop ran to completion, False if it was stopped by a
        KeyboardInterrupt, and None if count_per_post is smaller than 1 while
        at least one source image exists.
        '''
        names = sorted(
            name for name in self.__imageToHash if '_REPOST_' not in name)
        total = len(names)
        self.vPrint('generating ' + str(total) + ' reposts')
        prefix = str(uid) if uid else ''
        interrupted = False
        try:
            for i, name in enumerate(names):
                repname = prefix + '_REPOST_' + name

                # Name whose presence in both mappings means this source image
                # has already been handled. For several reposts per post the
                # entry prefixed with (count_per_post - 1) is used as marker
                # (presumably the last one generate_bad_repost produces --
                # TODO confirm against its naming scheme).
                if count_per_post == 1:
                    marker = repname
                elif count_per_post > 1:
                    marker = str(count_per_post - 1) + repname
                else:
                    # nothing can be generated; the finally clause still saves
                    return
                if marker in self.__imageToHash and marker in self.__imageToText:
                    continue

                if i < 30 or i % 10 == 0:
                    self.vPrint('partial: %5d/%d' % (i, total))

                try:
                    # Only forward a seed when one was supplied: the default
                    # seed=None cannot be offset by the image index.
                    seed_kwargs = {}
                    if seed is not None:
                        seed_kwargs['seed'] = seed + i

                    bad_imgs = generate_bad_repost(
                        join(self.img_dir, name),
                        count=count_per_post,
                        res=res,
                        rot=rot,
                        asp=asp,
                        crop=crop,
                        save_loc=join(self.img_dir, repname),
                        **seed_kwargs)
                    # a non-list result is treated as one image named repname
                    if not isinstance(bad_imgs, list):
                        bad_imgs = [(repname, bad_imgs)]

                    for newrepname, bad_img in bad_imgs:
                        bad_img_hash = Hasher.hashImage(
                            bad_img, self.__imagehash_method)
                        bad_img_text = OCR.read2Normalized(bad_img)
                        self.__imageToHash[newrepname] = bad_img_hash
                        self.__imageToText[newrepname] = bad_img_text
                except FileNotFoundError as e:
                    print(e)
                    print("skipped an image that doesn't exist")
                    continue
                except UnidentifiedImageError as e:
                    print(e)
                    print('skipped an unidentified image')
                    continue

            self.vPrint('done!')
        except KeyboardInterrupt:
            self.vPrint('interrupted!')
            interrupted = True
        finally:
            # persist whatever was generated so far, whatever the exit path
            self.saveProcessedDataToCache()
            self.vPrint('saved!')
        return not interrupted
    def processData(self, only_cached_files=False, max_capacity=None):
        '''
        Processes all posts and returns two dictionaries in a tuple.
        The first maps image name to hash, and
        the second maps image name to OCR results.

        The results are also kept in memory within the class, where other
        methods use them for checking reposts, and are saved to the cache
        before returning.

        Parameters:
        only_cached_files: when False, the candidates are the regular,
            non-hidden files of the image directory; when True, they are the
            names already present in the cached image-to-hash mapping.
        max_capacity: when not None, only the first max_capacity candidates
            (in sorted name order) are processed.

        Returns:
        A tuple of two dictionaries, first one containing image name to hash mappings
        and second one containing image name to OCR readings.
        '''
        if only_cached_files:
            self.readProcessedDataFromCache()
            files = sorted(self.__imageToHash)
        else:
            files = sorted(
                f for f in listdir(self.img_dir)
                if isfile(join(self.img_dir, f)) and not f.startswith('.'))
            self.readProcessedDataFromCache()

        if max_capacity is not None:
            files = files[:max_capacity]

        # d and t alias the instance mappings, so updates below are in place
        d = self.__imageToHash
        t = self.__imageToText

        total = len(files)
        self.vPrint("loading... " + str(total) + ' items')
        for i, file in enumerate(files):
            # small batches report every file, larger ones roughly every 5%;
            # the modulo is only evaluated when total >= 50, so the step is
            # never zero
            if total < 50 or i % (total // 20) == 0:
                self.vPrint('partial: %5d/%d' % (i, total))

            try:
                if file not in d or file not in t:
                    # close the file handle as soon as both readings are done
                    with Image.open(join(self.img_dir, file)) as img:
                        d[file] = Hasher.hashImage(img,
                                                   self.__imagehash_method)
                        t[file] = OCR.read2Normalized(img)
            except KeyboardInterrupt:
                self.vPrint('skipped remaining files')
                # drop the entry being worked on so no half-processed file
                # ends up in the cache
                d.pop(file, None)
                t.pop(file, None)
                break
            except UnidentifiedImageError:
                self.vPrint('skipped ' + file + ' (not an image)')
                d.pop(file, None)
                t.pop(file, None)

        self.vPrint('loaded: ' + str(len(d)) + ' items')
        self.saveProcessedDataToCache()
        return (d, t)
    def checkRepostDetection(self,
                             img: str,
                             img_sim_min: float = 0.8,
                             text_sim_min: float = 0.7,
                             recheck_img: bool = True,
                             generate_repost: bool = False,
                             save_generated_repost: bool = True):
        '''
        Checks whether reposts can be detected correctly using
        a naive algorithm considering image hashes and ocr text.

        This assumes the dataset is correctly labelled such that
        a reposted image is the image name prefixed with _REPOST_.

        If an image is custom crafted and you don't want it to
        make a deduction of whether it's a true positive or otherwise,
        simply avoid using the standard format name of:
            <subreddit>_<postID>.<imgExtension>

        Parameters:
        img: file name of the image to check, relative to the image directory.
        img_sim_min: minimum image similarity; an image counts as similar when
            its hash difference is at most 1 - img_sim_min.
        text_sim_min: minimum text similarity ratio. When it is 0 or less the
            text comparison is skipped and every text similarity is 0.0.
        recheck_img: recompute the hash and OCR text of img even when they are
            already known. They are always computed when they are missing.
        generate_repost: generate a dummy repost of img and include it in the
            comparison under the name '_REPOST_' + img.
        save_generated_repost: write the generated dummy repost to the image
            directory.

        Returns:
        A dictionary mapping the name of every other known image to a
        dictionary with the keys 'imgName', 'isRepost', 'validity'
        ('TP', 'FP', 'TN', 'FN', or '??' for non standard names),
        'imgDiff' and 'textSim'.
        '''
        # d and t alias the instance mappings, so updates below are in place
        d = self.__imageToHash
        t = self.__imageToText

        target_check = img
        target_path = join(self.img_dir, target_check)
        self.vPrint('we\'ll process post : ' + target_check)

        # Compute the target metadata when asked to, and also whenever it is
        # missing, so that an uncached image never ends in a KeyError.
        if recheck_img or target_check not in d or target_check not in t:
            self.vPrint('computing target metadata')
            with Image.open(target_path) as target_img:
                target_hash = Hasher.hashImage(target_img,
                                               self.__imagehash_method)
                target_text = OCR.read2Normalized(target_img)
            d[target_check] = target_hash
            t[target_check] = target_text
        else:
            target_hash = d[target_check]
            target_text = t[target_check]

        if generate_repost:
            bad_check = '_REPOST_' + target_check
            self.vPrint('generating dummy repost : _REPOST_' + target_check)
            bad_img = generate_bad_repost(target_path)
            self.vPrint('computing target metadata')
            bad_img_hash = Hasher.hashImage(bad_img, self.__imagehash_method)
            bad_img_text = OCR.read2Normalized(bad_img)
            # NOTE(review): the metadata is recorded (and possibly cached
            # below) even when the image itself is not saved -- confirm
            # that this is intended.
            d[bad_check] = bad_img_hash
            t[bad_check] = bad_img_text
            if save_generated_repost:
                bad_img.save(join(self.img_dir, bad_check))

        if self.update_cache:
            self.saveProcessedDataToCache()

        self.vPrint('\nchecking...')

        # (name, image hash difference, text similarity) for every other image
        distances = []
        for key, value in d.items():
            if key == target_check:
                continue
            img_diff = Hasher.diff(value, target_hash, 'IMAGE')
            text_sim = 0.0 if text_sim_min <= 0.0 else Levenshtein.ratio(
                t[key], target_text)
            distances.append((key, img_diff, text_sim))

        max_img_diff = 1 - img_sim_min
        max_txt_diff = 1 - text_sim_min

        def orderOfSort(x):
            '''dynamic sorting to prioritise text if image and text are both really close'''
            img_diff = x[1]
            txt_diff = 1 - x[2]
            if txt_diff <= max_txt_diff and img_diff <= max_img_diff:
                # shifted below zero so close matches come first, text leading
                return (txt_diff - 1, img_diff - 1)
            return (img_diff, txt_diff)

        distances.sort(key=orderOfSort)

        results = {}
        FP = 0
        FN = 0
        target_original = target_check.split('_REPOST_')[-1]

        self.vPrint('--- similar results ---')
        self.vPrint('  SAME?  | IMG_SIM | TEXT_SIM | IMAGE')
        for rank, (a, b, c) in enumerate(distances):
            # name with anything up to the last '_REPOST_' stripped off
            original_name = a.split('_REPOST_')[-1]
            dot_parts = a.split('.')
            standardFormat = len(dot_parts) == 2 and len(
                dot_parts[0].split('_REPOST_')[-1].split('_')) == 2
            is_known_same = original_name == target_original
            is_repost = b <= max_img_diff and c >= text_sim_min

            if not standardFormat:
                validity = '??'
            elif is_known_same:
                validity = 'TP' if is_repost else 'FN'
            else:
                validity = 'FP' if is_repost else 'TN'
            if validity == 'FN':
                FN += 1
            elif validity == 'FP':
                FP += 1

            # only the ten closest entries are printed
            if rank < 10 and self.verbose:
                self.vPrint('%8s   %7.3f   %8.3f    %-50s' % \
                            (('YES, ' if is_repost else ' NO, ') + validity,1-b,c,a))

                if standardFormat:
                    subreddit = original_name.split('_')[0]
                    post_id = original_name.split('_')[-1].split('.')[0]
                    self.vPrint('reddit.com/r/' + subreddit + '/comments/' +
                                post_id + '/')
                else:
                    self.vPrint(
                        '• this image isn\'t from the standard dataset')

                if is_known_same:
                    self.vPrint(
                        '• this is a known to be the same as the chosen image'
                    )
                self.vPrint()

            results[a] = {
                'imgName': a,
                'isRepost': is_repost,
                'validity': validity,
                'imgDiff': b,
                'textSim': c
            }

        if FP or FN:
            self.vPrint('important notes:')
            self.vPrint(
                'we have %d known false positives and %d known false negatives for this\n'
                % (FP, FN))

        return results