def getmatches(pictures, cache_path, threshold, match_scaled=False, j=job.nulljob):
    """Compare pictures pairwise (chunked, across a process pool) and return matches.

    :param pictures: iterable of picture objects; the code reads ``unicode_path``,
        ``dimensions``, ``is_ref`` and ``md5`` from them, and sets a ``cache_id``
        attribute on each picture that is present in the cache.
    :param cache_path: path to the analysis cache handed to ``get_cache`` and to the
        worker processes.
    :param threshold: minimum similarity percentage for a pair to appear in the result.
    :param match_scaled: when True, picture dimensions are withheld from the comparison
        info (``None`` is sent instead), presumably so rescaled copies can still
        match — TODO confirm against ``async_compare``.
    :param j: progress-reporting job object (defaults to the no-op ``job.nulljob``).
    :returns: list of objects built by ``get_match(ref, other, percentage)``.
    """
    def get_picinfo(p):
        # Per-picture info shipped to the worker: (dimensions-or-None, is_ref flag).
        if match_scaled:
            return (None, p.is_ref)
        else:
            return (p.dimensions, p.is_ref)

    def collect_results(collect_all=False):
        # Collect results and wait until the queue is small enough to accommodate new
        # results. With collect_all=True, drain the queue completely (limit 0).
        nonlocal async_results, matches, comparison_count, comparisons_to_do
        limit = 0 if collect_all else RESULTS_QUEUE_LIMIT
        while len(async_results) > limit:
            # Split pending async results into finished ("ready") and still-running.
            ready, working = extract(lambda r: r.ready(), async_results)
            for result in ready:
                matches += result.get()
                async_results.remove(result)
                comparison_count += 1
        # About the NOQA below: I think there's a bug in pyflakes. To investigate...
        progress_msg = tr("Performed %d/%d chunk matches") % (
            comparison_count, len(comparisons_to_do))  # NOQA
        j.set_progress(comparison_count, progress_msg)

    # Phase weighting: 3/10 of the progress bar for cache preparation, 7/10 for the rest.
    j = j.start_subjob([3, 7])
    pictures = prepare_pictures(pictures, cache_path, with_dimensions=not match_scaled, j=j)
    j = j.start_subjob([9, 1], tr("Preparing for matching"))
    cache = get_cache(cache_path)
    id2picture = {}
    for picture in pictures:
        try:
            # Pictures missing from the cache raise ValueError and are silently
            # skipped; they simply never get a cache_id.
            picture.cache_id = cache.get_id(picture.unicode_path)
            id2picture[picture.cache_id] = picture
        except ValueError:
            pass
    cache.close()
    # Keep only pictures that were found in the cache above.
    pictures = [p for p in pictures if hasattr(p, 'cache_id')]
    pool = multiprocessing.Pool()
    async_results = []
    matches = []
    chunks = get_chunks(pictures)
    # We add a None element at the end of the chunk list because each chunk has to be compared
    # with itself. Thus, each chunk will show up as a ref_chunk having other_chunk set to None once.
    comparisons_to_do = list(combinations(chunks + [None], 2))
    comparison_count = 0
    j.start_job(len(comparisons_to_do))
    try:
        for ref_chunk, other_chunk in comparisons_to_do:
            picinfo = {p.cache_id: get_picinfo(p) for p in ref_chunk}
            ref_ids = [p.cache_id for p in ref_chunk]
            if other_chunk is not None:
                other_ids = [p.cache_id for p in other_chunk]
                picinfo.update(
                    {p.cache_id: get_picinfo(p) for p in other_chunk})
            else:
                # other_ids=None signals "compare ref_chunk against itself".
                other_ids = None
            args = (ref_ids, other_ids, cache_path, threshold, picinfo)
            async_results.append(pool.apply_async(async_compare, args))
            # Throttle: blocks while more than RESULTS_QUEUE_LIMIT jobs are pending.
            collect_results()
        collect_results(collect_all=True)
    except MemoryError:
        # Rare, but possible, even in 64bit situations (ref #264). What do we do now? We free us
        # some wiggle room, log about the incident, and stop matching right here. We then process
        # the matches we have. The rest of the process doesn't allocate much and we should be
        # alright.
        del comparisons_to_do, chunks, pictures  # some wiggle room for the next statements
        logging.warning("Ran out of memory when scanning! We had %d matches.", len(matches))
        del matches[
            -len(matches) // 3:]  # some wiggle room to ensure we don't run out of memory again.
    pool.close()
    result = []
    # iterconsume empties `matches` as it yields, keeping peak memory down while
    # the progress job reports verification counts.
    myiter = j.iter_with_progress(
        iterconsume(matches, reverse=False),
        tr("Verified %d/%d matches"),
        every=10,
        count=len(matches),
    )
    for ref_id, other_id, percentage in myiter:
        ref = id2picture[ref_id]
        other = id2picture[other_id]
        # A 100% score with differing md5 digests is demoted to 99 so that only
        # byte-identical files can report a perfect match.
        if percentage == 100 and ref.md5 != other.md5:
            percentage = 99
        if percentage >= threshold:
            ref.dimensions  # pre-read dimensions for display in results
            other.dimensions
            result.append(get_match(ref, other, percentage))
    return result
# NOTE(review): this is a second definition of `getmatches` in the same module; at
# import time it shadows the earlier one. It differs only in the default
# `threshold=75`, the cache constructor (`Cache` vs `get_cache`), and minor
# formatting — confirm which version is intended and remove the other.
def getmatches(pictures, cache_path, threshold=75, match_scaled=False, j=job.nulljob):
    """Compare pictures pairwise (chunked, across a process pool) and return matches.

    :param pictures: iterable of picture objects; the code reads ``unicode_path``,
        ``dimensions``, ``is_ref`` and ``md5`` from them, and sets a ``cache_id``
        attribute on each picture that is present in the cache.
    :param cache_path: path to the analysis cache handed to ``Cache`` and to the
        worker processes.
    :param threshold: minimum similarity percentage for a pair to appear in the
        result (default 75).
    :param match_scaled: when True, picture dimensions are withheld from the comparison
        info (``None`` is sent instead), presumably so rescaled copies can still
        match — TODO confirm against ``async_compare``.
    :param j: progress-reporting job object (defaults to the no-op ``job.nulljob``).
    :returns: list of objects built by ``get_match(ref, other, percentage)``.
    """
    def get_picinfo(p):
        # Per-picture info shipped to the worker: (dimensions-or-None, is_ref flag).
        if match_scaled:
            return (None, p.is_ref)
        else:
            return (p.dimensions, p.is_ref)

    def collect_results(collect_all=False):
        # Collect results and wait until the queue is small enough to accommodate new
        # results. With collect_all=True, drain the queue completely (limit 0).
        nonlocal async_results, matches, comparison_count
        limit = 0 if collect_all else RESULTS_QUEUE_LIMIT
        while len(async_results) > limit:
            # Split pending async results into finished ("ready") and still-running.
            ready, working = extract(lambda r: r.ready(), async_results)
            for result in ready:
                matches += result.get()
                async_results.remove(result)
                comparison_count += 1
        progress_msg = tr("Performed %d/%d chunk matches") % (comparison_count, len(comparisons_to_do))
        j.set_progress(comparison_count, progress_msg)

    # Phase weighting: 3/10 of the progress bar for cache preparation, 7/10 for the rest.
    j = j.start_subjob([3, 7])
    pictures = prepare_pictures(pictures, cache_path, with_dimensions=not match_scaled, j=j)
    j = j.start_subjob([9, 1], tr("Preparing for matching"))
    cache = Cache(cache_path)
    id2picture = {}
    for picture in pictures:
        try:
            # Pictures missing from the cache raise ValueError and are silently
            # skipped; they simply never get a cache_id.
            picture.cache_id = cache.get_id(picture.unicode_path)
            id2picture[picture.cache_id] = picture
        except ValueError:
            pass
    cache.close()
    # Keep only pictures that were found in the cache above.
    pictures = [p for p in pictures if hasattr(p, 'cache_id')]
    pool = multiprocessing.Pool()
    async_results = []
    matches = []
    chunks = get_chunks(pictures)
    # We add a None element at the end of the chunk list because each chunk has to be compared
    # with itself. Thus, each chunk will show up as a ref_chunk having other_chunk set to None once.
    comparisons_to_do = list(combinations(chunks + [None], 2))
    comparison_count = 0
    j.start_job(len(comparisons_to_do))
    try:
        for ref_chunk, other_chunk in comparisons_to_do:
            picinfo = {p.cache_id: get_picinfo(p) for p in ref_chunk}
            ref_ids = [p.cache_id for p in ref_chunk]
            if other_chunk is not None:
                other_ids = [p.cache_id for p in other_chunk]
                picinfo.update({p.cache_id: get_picinfo(p) for p in other_chunk})
            else:
                # other_ids=None signals "compare ref_chunk against itself".
                other_ids = None
            args = (ref_ids, other_ids, cache_path, threshold, picinfo)
            async_results.append(pool.apply_async(async_compare, args))
            # Throttle: blocks while more than RESULTS_QUEUE_LIMIT jobs are pending.
            collect_results()
        collect_results(collect_all=True)
    except MemoryError:
        # Rare, but possible, even in 64bit situations (ref #264). What do we do now? We free us
        # some wiggle room, log about the incident, and stop matching right here. We then process
        # the matches we have. The rest of the process doesn't allocate much and we should be
        # alright.
        del comparisons_to_do, chunks, pictures  # some wiggle room for the next statements
        logging.warning("Ran out of memory when scanning! We had %d matches.", len(matches))
        del matches[-len(matches)//3:]  # some wiggle room to ensure we don't run out of memory again.
    pool.close()
    result = []
    # iterconsume empties `matches` as it yields, keeping peak memory down while
    # the progress job reports verification counts.
    myiter = j.iter_with_progress(
        iterconsume(matches, reverse=False),
        tr("Verified %d/%d matches"),
        every=10,
        count=len(matches),
    )
    for ref_id, other_id, percentage in myiter:
        ref = id2picture[ref_id]
        other = id2picture[other_id]
        # A 100% score with differing md5 digests is demoted to 99 so that only
        # byte-identical files can report a perfect match.
        if percentage == 100 and ref.md5 != other.md5:
            percentage = 99
        if percentage >= threshold:
            ref.dimensions  # pre-read dimensions for display in results
            other.dimensions
            result.append(get_match(ref, other, percentage))
    return result