Example #1
def process(input_dir, out_dir_clean, out_dir_discard, res: str):
    os.makedirs(out_dir_clean, exist_ok=True)
    os.makedirs(out_dir_discard, exist_ok=True)

    images_cleaned = set(os.listdir(out_dir_clean))
    images_discarded = set(os.listdir(out_dir_discard))

    images_dl = os.listdir(input_dir)
    N = len(images_dl) // NUM_TASKS

    res = get_res(res)  # parse the target resolution once, not per image
    clean = 0
    discarded = 0
    for i, imfile in job_enumerate(images_dl):
        if imfile in images_cleaned:
            clean += 1
            continue
        if imfile in images_discarded:
            discarded += 1
            continue
        im = Image.open(join(input_dir, imfile))
        im2 = resize_or_discard(im, res, should_clean=True)
        if im2 is not None:
            fn, ext = os.path.splitext(imfile)
            im2.save(join(out_dir_clean, fn + '.jpg'), quality=QUALITY)
            clean += 1
        else:
            im.save(join(out_dir_discard, imfile))
            discarded += 1
        print(f'\r{os.path.basename(input_dir)} -> {os.path.basename(out_dir_clean)} // '
              f'Resized: {clean}/{N}; Discarded: {discarded}/{N}', end='')
    # Done
    print(f'\n{os.path.basename(input_dir)} -> {os.path.basename(out_dir_clean)} // '
          f'Resized: {clean}/{N}; Discarded: {discarded}/{N}')
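
All of these examples rely on a task-array helper module that is not shown: NUM_TASKS is the number of parallel array jobs, and job_enumerate yields only the slice of the input that belongs to the current job. A minimal sketch, assuming SLURM-style environment variables (the variable names are an assumption, not the original module):

import os

# Hypothetical reconstruction of the task-array helpers used above.
NUM_TASKS = int(os.environ.get('SLURM_ARRAY_TASK_COUNT', 1))
TASK_ID = int(os.environ.get('SLURM_ARRAY_TASK_ID', 0))

def job_enumerate(iterable):
    # Yield (global_index, element) for the elements assigned to this
    # job, striding through the input with step NUM_TASKS.
    for i, elem in enumerate(iterable):
        if i % NUM_TASKS == TASK_ID:
            yield i, elem
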
Example #2

    def process_all_in(self, input_dir, filter_imgs_dir):
        images_dl = iter_images(input_dir)  # generator of paths

        # files this job should compress
        files_of_job = [p for _, p in job_enumerate(images_dl)]
        # files already compressed by somebody (e.g. this job in an earlier run)
        processed_already = self.images_cleaned | self.images_discarded
        # resulting files to be compressed
        files_of_job = [
            p for p in files_of_job if get_fn(p) not in processed_already
        ]

        if filter_imgs_dir:
            ps_orig = cached_listdir_imgs(filter_imgs_dir,
                                          discard_shitty=True).ps
            fns_to_use = set(map(get_fn, ps_orig))
            print('Filtering with', len(fns_to_use), 'filenames. Before:',
                  len(files_of_job))
            files_of_job = [p for p in files_of_job if get_fn(p) in fns_to_use]
            print('Filtered, now', len(files_of_job))

        N = len(files_of_job)
        if N == 0:
            print('Everything processed / nothing to process.')
            return

        num_process = 2 if NUM_TASKS > 1 else int(
            os.environ.get('MAX_PROCESS', 16))
        print(
            f'Processing {N} images using {num_process} processes in {NUM_TASKS} tasks...'
        )

        start = time.time()
        predicted_time = None
        with multiprocessing.Pool(processes=num_process) as pool:
            for i, _ in enumerate(
                    pool.imap_unordered(self.process, files_of_job)):
                if i > 0 and i % 100 == 0:
                    time_per_img = (time.time() - start) / (i + 1)
                    time_remaining = time_per_img * (N - i)
                    if not predicted_time:
                        predicted_time = time_remaining
                    print(
                        f'\r{time_per_img:.2e} s/img | {i / N * 100:.1f}% | {time_remaining / 60:.1f} min remaining',
                        end='',
                        flush=True)
        if predicted_time:
            print(
                f'Actual time: {(time.time() - start) / 60:.1f} min // '
                f'predicted: {predicted_time / 60:.1f} min')
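
iter_images and get_fn are likewise external. Judging from the call sites (paths in, extension-less basenames as deduplication keys), they could look roughly like the following sketch; the extension list and the recursive walk are assumptions.

import os

IMG_EXTS = {'.png', '.jpg', '.jpeg'}  # assumed; the original list is not shown

def iter_images(root, num_folder_levels=0):
    # Recursively yield image paths under root. The original limits the
    # search depth via num_folder_levels; this sketch walks everything.
    for dirpath, _, filenames in os.walk(root):
        for fn in filenames:
            if os.path.splitext(fn)[1].lower() in IMG_EXTS:
                yield os.path.join(dirpath, fn)

def get_fn(p):
    # Basename without directory or extension, used as the dedup key.
    return os.path.splitext(os.path.basename(p))[0]
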
Example #3

def _create_pickle(root_p,
                   pkl_p,
                   distributed_create,
                   create_without_shitty,
                   num_folder_levels=0):
    print(f'Globbing {root_p}...')
    ps = list(iter_images(root_p, num_folder_levels))
    print(f'Found {len(ps)} files!')
    if distributed_create:
        print('--distributed_create, filtering files...')
        ps = [p for _, p in task_array.job_enumerate(ps)]
        print(f'{len(ps)} files left!')
    if create_without_shitty:
        print('--create_without_shitty given')
    database = []
    start = time.time()
    shitty_reasons = defaultdict(int)
    num_processes = 8 if distributed_create else int(
        os.environ.get('MAX_PROCESS', 16))
    h = _ProcessHelper(create_without_shitty)
    with multiprocessing.Pool(processes=num_processes) as pool:
        for i, res in enumerate(pool.imap_unordered(h.process, ps)):
            if res is None:
                continue
            database.append(res)
            _, _, shitty_reason = res
            if shitty_reason:
                shitty_reasons[shitty_reason] += 1
            if i > 0 and i % 100 == 0:
                time_per_img = (time.time() - start) / (i + 1)
                time_remaining = time_per_img * (len(ps) - i)
                info = f'\r{time_per_img:.2e} s/img | {i / len(ps) * 100:.1f}% | {time_remaining / 60:.1f} min remaining'
                if shitty_reasons and i % 1000 == 0:
                    info += ' ' + '|'.join(f'{reason}:{count}'
                                           for reason, count in sorted(
                                               shitty_reasons.items(),
                                               key=operator.itemgetter(1)))
                print(info, end='', flush=True)
    info = '|'.join(f'{reason}:{count}'
                    for reason, count in sorted(shitty_reasons.items(),
                                                key=operator.itemgetter(1)))
    print('\nshitty_reasons', info)
    print('Processed all')
    if len(database) == 0:
        raise ValueError(f'No images found in {root_p}!')
    print(f'\nGlobbed {len(database)} images, storing in {pkl_p}!')
    with open(pkl_p, 'wb') as fout:
        pickle.dump(database, fout)
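
Judging from the unpacking _, _, shitty_reason = res above, the pickle holds a flat list of three-element tuples. Reading it back is then simply:

import pickle

with open(pkl_p, 'rb') as fin:  # the same pkl_p that _create_pickle wrote
    database = pickle.load(fin)
# e.g. keep only entries without a shitty_reason:
clean_entries = [entry for entry in database if not entry[2]]
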
Example #4
def process(input_dir, out_dir, res: str, should_clean):
    inp_base, inp_name = os.path.split(input_dir.rstrip(os.path.sep))
    if out_dir is None:
        out_dir = inp_base
    output_dir = os.path.join(
        out_dir, inp_name + '_' + res + ('_clean' if should_clean else '_all'))
    os.makedirs(output_dir, exist_ok=True)
    # if not should_clean, we do not discard because of HSV, but still might because of size!
    discard_dir = os.path.join(
        out_dir, inp_name + '_' + res +
        ('_discard' if should_clean else '_discard_size'))
    os.makedirs(discard_dir, exist_ok=True)

    images_dl = os.listdir(input_dir)
    N = len(images_dl) // NUM_TASKS
    images_done = set(os.listdir(output_dir)) | set(os.listdir(discard_dir))

    res = get_res(res)  # parse the target resolution once, not per image
    clean = 0
    discarded = 0
    skipped = 0
    for i, imfile in job_enumerate(images_dl):
        if imfile in images_done:
            skipped += 1
            continue
        im = Image.open(join(input_dir, imfile))
        im2 = resize_or_discard(im, res, should_clean=should_clean)
        if im2 is not None:
            fn, ext = os.path.splitext(imfile)
            im2.save(join(output_dir, fn + '.png'))
            clean += 1
        else:
            im.save(join(discard_dir, imfile))
            discarded += 1
        print('\r{} -> {} // Skipped {}/{}, Resized {}/{}, Discarded {}/{}'.
              format(os.path.basename(input_dir), os.path.basename(output_dir),
                     skipped, N, clean, N, discarded, N),
              end='')
    # Done
    print(
        '\n{} -> {} // Skipped {}/{}, Resized {}/{}, Discarded {}/{} // Completed'
        .format(input_dir, output_dir, skipped, N, clean, N, discarded, N))
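
get_res and resize_or_discard are shared by examples #1 and #4 but never shown. The sketch below is consistent with the call sites: the string spec is parsed once, and the resize either returns an image or None for discards. The parsing rule, the minimum-size check, and the HSV heuristic are all assumptions.

from PIL import Image

def get_res(res: str) -> int:
    # Assumption: the spec is just the target short-side length, e.g. '512'.
    return int(res)

def looks_shitty_hsv(im):
    # Placeholder for the HSV-based quality heuristic hinted at above.
    return False

def resize_or_discard(im, res: int, should_clean: bool):
    # Return im with its shorter side scaled to res, or None to discard.
    w, h = im.size
    if min(w, h) < res:
        return None  # too small to downscale -> discarded because of size
    if should_clean and looks_shitty_hsv(im):
        return None  # discarded because of HSV
    scale = res / min(w, h)
    return im.resize((round(w * scale), round(h * scale)), Image.LANCZOS)
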
Example #5
    def process_all_in(self, input_dir):
        images_dl = iter_images(input_dir)  # generator of paths

        # files this job should compress
        files_of_job = [p for _, p in job_enumerate(images_dl)]
        # files already compressed by somebody (e.g. this job in an earlier run)
        processed_already = self.images_cleaned | self.images_discarded
        # resulting files to be compressed
        files_of_job = [
            p for p in files_of_job if get_fn(p) not in processed_already
        ]

        N = len(files_of_job)
        if N == 0:
            print('Everything processed / nothing to process.')
            return

        num_process = 2 if NUM_TASKS > 1 else _NUM_PROCESSES
        print(
            f'Processing {N} images using {num_process} processes in {NUM_TASKS} tasks...'
        )

        start = time.time()
        predicted_time = None
        with multiprocessing.Pool(processes=num_process) as pool:
            for i, _ in enumerate(
                    pool.imap_unordered(self.process, files_of_job)):
                if i > 0 and i % 100 == 0:
                    time_per_img = (time.time() - start) / (i + 1)
                    time_remaining = time_per_img * (N - i)
                    if not predicted_time:
                        predicted_time = time_remaining
                    print(
                        f'\r{time_per_img:.2e} s/img | '
                        f'{i / N * 100:.1f}% | '
                        f'{time_remaining / 60:.1f} min remaining',
                        end='',
                        flush=True)
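
The sets self.images_cleaned and self.images_discarded that make these runs resumable are built elsewhere. Assuming output files keep the input's basename (as in example #1), they could be collected like this; the folder attribute names are assumptions:

import os

def existing_fns(d):
    # Extension-less basenames already present in folder d.
    return {os.path.splitext(f)[0] for f in os.listdir(d)}

# e.g. in the (unshown) constructor:
#   self.images_cleaned = existing_fns(self.out_dir_clean)
#   self.images_discarded = existing_fns(self.out_dir_discard)
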
Example #6
def compress(compressor: Compressor, indir, discard_shitty):
    ds = cached_listdir_imgs.cached_listdir_imgs(indir,
                                                 min_size=None,
                                                 discard_shitty=discard_shitty)
    imgs = ds.ps
    compressor.optimal_qs = get_optimal_qs.read(indir)
    if compressor.optimal_qs:
        print('Optimal Qs:', len(compressor.optimal_qs))
    assert len(imgs) > 0, f'no matches for {indir}'
    num_imgs_to_process = len(imgs) // task_array.NUM_TASKS  # per-task share; unused below

    images_of_job = [p for _, p in task_array.job_enumerate(imgs)]
    N_orig = len(images_of_job)
    images_of_job = [
        p for p in images_of_job if os.path.splitext(os.path.basename(p))[0]
        not in compressor.files_that_exist
    ]
    N = len(images_of_job)

    start = time.time()
    num_process = 2 if task_array.NUM_TASKS > 1 else int(
        os.environ.get('MAX_PROCESS', 16))
    print(
        f'{task_array.JOB_ID}:',
        f'Compressing {N}/{N_orig} images ({ds.id}) using {num_process} processes',
        f'in {task_array.NUM_TASKS} tasks...')

    with multiprocessing.Pool(processes=num_process) as pool:
        for i, _ in enumerate(
                pool.imap_unordered(compressor.compress, images_of_job)):
            if i > 0 and i % 5 == 0:
                time_per_img = (time.time() - start) / (i + 1)
                time_remaining = time_per_img * (N - i)
                print(
                    f'\r{time_per_img*num_process:.2e} s/img | '
                    f'{i / N * 100:.1f}% | {time_remaining / 60:.1f} min remaining',
                    end='',
                    flush=True)
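
Compressor itself is not shown either. A skeleton consistent with this driver (per-image work, outputs keyed by extension-less basename, an optional per-image quality lookup) might look as follows; apart from optimal_qs, files_that_exist, and compress, every name is an assumption, and run_codec stands in for the real codec call.

import os

class Compressor:
    def __init__(self, out_dir):
        self.out_dir = out_dir
        # Extension-less basenames already in out_dir; the driver above
        # uses this set to skip images compressed by an earlier run.
        self.files_that_exist = {
            os.path.splitext(f)[0] for f in os.listdir(out_dir)}
        self.optimal_qs = None  # optionally filled in by the driver

    def compress(self, p):
        fn = os.path.splitext(os.path.basename(p))[0]
        q = (self.optimal_qs or {}).get(fn, 12)  # 12: assumed default quality
        out_p = os.path.join(self.out_dir, fn + '.out')
        run_codec(p, out_p, q)

def run_codec(in_p, out_p, q):
    # Placeholder for the actual codec invocation (e.g. a subprocess).
    pass
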
Example #7
def create_curves_for_images(root_dir, out_dir, grid, mode):
    times = []
    # Make sure we exclude here! Otherwise task_array.job_enumerate sees a different number of files in every job.
    all_img_ps = _get_image_paths(root_dir, exclude={'tmp'})
    assert_exc(len(all_img_ps) > 0, 'No images found', ValueError)

    non_pngs = [p for p in all_img_ps if not p.endswith('.png')]
    assert_exc(
        len(non_pngs) == 0,
        f'Only .pngs are supported by this code! Found {len(non_pngs)} others.'
    )

    measure_over_interval = {
        'bpg': bpg_measure_over_interval,
        'balle': balle_measure_over_interval,
        # 'bpgslow':  bpg_measure_over_interval_slow,
        'jp2k': jp2k_measure_over_interval,
        'jp': jp_measure_over_interval,
        'webp': webp_measure_over_interval
    }[mode]

    for i, img_p in task_array.job_enumerate(all_img_ps):
        print('>>>', task_array.TASK_ID, 'compresses', os.path.basename(img_p))
        img_name = os.path.splitext(os.path.basename(img_p))[0]
        s = time.time()
        mf = measures_file_p(out_dir, img_name)
        if complete_measures_file_exists(mf, num_ops=len(grid)):
            print(f'Found output for {img_name}, skipping...')
            continue
        # need to create measures file
        with open(mf, 'w+') as f:
            measure_over_interval(img_p, f, grid)
        times.append(time.time() - s)
        avg_time = np.mean(times[-15:])
        print(f'Time left: {avg_time * (len(all_img_ps) - i) / 60:.2f} min')
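
The skip logic above depends on measures_file_p and complete_measures_file_exists, both external. A plausible sketch, assuming one output line per grid point (the file naming is an assumption):

import os

def measures_file_p(out_dir, img_name):
    # Assumed layout: one measures file per image.
    return os.path.join(out_dir, img_name + '.measures')

def complete_measures_file_exists(mf, num_ops):
    # Treat the file as complete once it holds one line per grid point.
    if not os.path.isfile(mf):
        return False
    with open(mf) as f:
        return sum(1 for _ in f) >= num_ops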