Example #1
def get_conf(name: str) -> Union[str, int, bool]:
    check_init()
    if name in _config:
        return _config[name]
    if name in C.DEFAULT_CONFIG:
        log.warning("hydownloader", f'Configuration key not found in user config, default value was used: {name}')
        return C.DEFAULT_CONFIG[name]
    log.fatal("hydownloader", f'Invalid configuration key: {name}')
Example #2
def check_db_version() -> None:
    c = get_conn().cursor()
    c.execute('select version from version')
    v = c.fetchall()
    if len(v) != 1:
        log.fatal("hydownloader", "Invalid version table in hydownloader database")
    if v[0]['version'] != __version__:
        log.fatal("hydownloader", "Unsupported hydownloader database version found")
Example #3
def subscription_data_to_url(downloader: str, keywords: str) -> str:
    """
    This function takes a hydownloader downloader name (not the same as a gallery-dl downloader name!)
    and some keywords and generates a (usually gallery) URL for gallery-dl to download.
    In Hydrus terms, this does the same thing as a GUG (gallery URL generator).
    """
    if downloader == "gelbooru":
        return f"https://gelbooru.com/index.php?page=post&s=list&tags={keywords}"
    if downloader == "pixivuser":
        return f"https://www.pixiv.net/en/users/{keywords}"
    if downloader == "pixivranking":
        return f"https://www.pixiv.net/ranking.php?mode={keywords}"
    if downloader == "pixivtagsearch":
        return f"https://www.pixiv.net/en/tags/{keywords}/artworks?s_mode=s_tag"
    if downloader == "raw":
        return keywords
    if downloader == "nijieuser":
        return f"https://nijie.info/members.php?id={keywords}"
    if downloader == "lolibooru":
        return f"https://lolibooru.moe/post?tags={keywords}"
    if downloader == "patreon":
        return f"https://www.patreon.com/{keywords}/posts"
    if downloader == "danbooru":
        return f"https://danbooru.donmai.us/posts?tags={keywords}"
    if downloader == "3dbooru":
        return f"http://behoimi.org/post/index?tags={keywords}"
    if downloader == "sankaku":
        return f"https://chan.sankakucomplex.com/?tags={keywords}&commit=Search"
    if downloader == "artstationuser":
        return f"https://www.artstation.com/{keywords}"
    if downloader == "idolcomplex":
        return f"https://idol.sankakucomplex.com/?tags={keywords}&commit=Search"
    if downloader == "twitter":
        return f"https://twitter.com/{keywords}"
    if downloader == "tumblr":
        return f"https://{keywords}.tumblr.com"
    if downloader == "deviantartuser":
        return f"https://deviantart.com/{keywords}"
    if downloader == "fanbox":
        return f"https://{keywords}.fanbox.cc"
    if downloader == "fantia":
        return f"https://fantia.jp/fanclubs/{keywords}"
    if downloader == "webtoons":
        return f"https://webtoons.com/{keywords}"
    if downloader == "kemonoparty":
        return f"https://kemono.party/{keywords}"
    if downloader == "baraag":
        return f"https://baraag.net/@{keywords}"
    if downloader == "hentaifoundry":
        return f"https://www.hentai-foundry.com/user/{keywords}/profile"
    if downloader == "yandere":
        return f"https://yande.re/post?tags={keywords}"

    log.fatal("hydownloader", f"Invalid downloader: {downloader}")
Example #4
def update_anchor(path: str, hydrus_master_db: str, sites: str,
                  unrecognized_urls_file: Optional[str],
                  recognized_urls_file: Optional[str]) -> None:
    """
    This function goes through all URLs in a Hydrus database and tries to match them to known site-specific URL patterns to
    generate anchor database entries that gallery-dl can recognize. For some sites, the anchor format differs
    from the gallery-dl default; these overrides are set in gallery-dl-config.json.
    """
    log.init(path, True)
    db.init(path)
    if not os.path.isfile(hydrus_master_db):
        log.fatal("hydownloader-anchor-exporter",
                  "The given client.master.db file does not exist!")
    hydrus_db = sqlite3.connect(hydrus_master_db)
    hydrus_db.row_factory = sqlite3.Row
    anchor_init_needed = not os.path.isfile(path + "/anchor.db")
    anchor_db = sqlite3.connect(path + "/anchor.db")
    hc = hydrus_db.cursor()
    ac = anchor_db.cursor()
    if anchor_init_needed:
        ac.execute('CREATE TABLE archive (entry PRIMARY KEY) WITHOUT ROWID')
        anchor_db.commit()
    ac.execute('select * from archive')
    known_anchors = {row[0] for row in ac.fetchall()}
    log.info("hydownloader-anchor-exporter",
             "Querying Hydrus database for URLs...")
    hc.execute('select * from url_domains natural inner join urls')
    rows = hc.fetchall()
    all_rows = len(rows)
    processed = 0
    suspicious_urls = set()
    recognized_urls = set()

    sites_to_keywords: dict[str, Tuple[list[str], list[str]]] = {
        'pixiv': (["pixi"], []),
        'gelbooru': (["gelbooru"], []),
        'nijie': (["nijie"], []),
        'lolibooru': (['lolibooru'], []),
        'danbooru': (['danbooru'], []),
        '3dbooru': (['behoimi'], []),
        'sankaku': (['sankaku'], ["idol."]),
        'idolcomplex': (["idol.sankaku"], []),
        'artstation': (["artstation"], []),
        'twitter': (["twitter", "nitter"], []),
        'deviantart': (['deviantart'], []),
        'tumblr': (["tumblr"], [])
    }

    siteset = {x.strip() for x in sites.split(',') if x.strip()}
    if sites == "all":
        siteset = set(sites_to_keywords.keys())
    anchors: Counter[str] = collections.Counter()

    for site in siteset:
        if site not in sites_to_keywords:
            log.fatal('hydownloader-anchor-exporter',
                      f'Unsupported site: {site}')

    def process_url(url):
        patterns = urls.anchor_patterns_from_url(url)
        if patterns:
            recognized_urls.add(url)
            anchors[patterns[0]] += 1
        else:
            suspicious_urls.add(url)

    log.info("hydownloader-anchor-exporter", "Processing URLs...")
    for row in rows:
        processed += 1
        if processed % 1000 == 0:
            print(f"Processed {processed}/{all_rows} URLs")
        for site in siteset:
            accepts, rejects = sites_to_keywords[site]
            url_ok = False
            for accept in accepts:
                if accept in row['url']:
                    url_ok = True
                    break
            if url_ok:
                for reject in rejects:
                    if reject in row['url']: url_ok = False
            if url_ok:
                process_url(row['url'])
    log.info("hydownloader-anchor-exporter", "Done processing URLs")

    if unrecognized_urls_file:
        log.info("hydownloader-anchor-exporter",
                 "Writing unrecognized URLs...")
        with open(unrecognized_urls_file, 'w') as f:
            for url in sorted(suspicious_urls):
                f.write(url.strip() + '\n')
        log.info("hydownloader-anchor-exporter",
                 "Done writing unrecognized URLs")
    if recognized_urls_file:
        log.info("hydownloader-anchor-exporter", "Writing recognized URLs...")
        with open(recognized_urls_file, 'w') as f:
            for url in sorted(recognized_urls):
                f.write(url.strip() + '\n')
        log.info("hydownloader-anchor-exporter",
                 "Done writing recognized URLs")

    log.info("hydownloader-anchor-exporter", "Inserting new anchors...")
    anchor_count = len(anchors.keys())
    processed = 0
    new_anchor_rows = 0
    for anchor in anchors:
        processed += 1
        if processed % 50 == 0:
            print(f"Inserting new anchors {processed}/{anchor_count}")
        final_anchors = [anchor]
        if anchor.startswith("nijie"):
            for i in range(anchors[anchor]):
                final_anchors.append(anchor + "_" + str(i))
        if anchor.startswith("twitter") or anchor.startswith("tumblr"):
            for i in range(anchors[anchor] + 1):
                final_anchors.append(anchor + "_" + str(i))
        if anchor.startswith("pixiv"):
            for i in range(anchors[anchor]):
                final_anchors.append(anchor + "_p{:02d}".format(i))
        for f_a in final_anchors:
            if f_a in known_anchors:
                continue
            ac.execute('insert into archive(entry) values (?)', (f_a, ))
            new_anchor_rows += 1
    log.info(
        "hydownloader-anchor-exporter",
        f"Done inserting new anchors, added {new_anchor_rows} entries in total"
    )

    anchor_db.commit()
    anchor_db.close()
    hydrus_db.close()
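
For illustration, the suffix expansion near the end of update_anchor turns one counted anchor into per-file entries; a small worked sketch with a hypothetical pixiv anchor:

import collections

# Hypothetical anchor counted 3 times in the Hydrus URL data.
anchors = collections.Counter({'pixiv12345678': 3})
final_anchors = ['pixiv12345678'] + [
    'pixiv12345678_p{:02d}'.format(i) for i in range(anchors['pixiv12345678'])
]
# final_anchors == ['pixiv12345678', 'pixiv12345678_p00',
#                   'pixiv12345678_p01', 'pixiv12345678_p02']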
Example #5
def update_anchor(path: str, hydrus_db_folder: str, sites: str,
                  unrecognized_urls_file: Optional[str],
                  recognized_urls_file: Optional[str], fill_known_urls: bool,
                  keep_old_hydrus_url_data: bool) -> None:
    """
    This function goes through all URLs in a Hydrus database and tries to match them to known site-specific URL patterns to
    generate anchor database entries that gallery-dl can recognize. For some sites, the anchor format differs
    from the gallery-dl default; these overrides are set in gallery-dl-config.json.
    If enabled, it also fills the known_urls table in the hydownloader DB with all URLs known to Hydrus.
    """
    log.init(path, True)
    db.init(path)
    if not os.path.isfile(hydrus_db_folder + "/client.master.db"):
        log.fatal(
            "hydownloader-anchor-exporter",
            "The client.master.db database was not found at the given location!"
        )
    hydrus_db = sqlite3.connect("file:" + hydrus_db_folder +
                                "/client.master.db?mode=ro",
                                uri=True)
    hydrus_db.row_factory = sqlite3.Row
    anchor_init_needed = not os.path.isfile(path + "/anchor.db")
    anchor_db = sqlite3.connect(path + "/anchor.db")
    hc = hydrus_db.cursor()
    ac = anchor_db.cursor()
    if anchor_init_needed:
        ac.execute('CREATE TABLE archive (entry PRIMARY KEY) WITHOUT ROWID')
        anchor_db.commit()
    ac.execute('select * from archive')
    known_anchors = {row[0] for row in ac.fetchall()}
    log.info("hydownloader-anchor-exporter",
             "Querying Hydrus database for URLs...")
    hc.execute('select * from url_domains natural inner join urls')
    rows = hc.fetchall()
    all_rows = len(rows)
    processed = 0
    suspicious_urls = set()
    recognized_urls = set()
    current_url_ids = set()
    deleted_url_ids = set()
    if fill_known_urls:
        if not os.path.isfile(hydrus_db_folder + "/client.db"):
            log.fatal(
                "hydownloader-anchor-exporter",
                "The client.db database was not found at the given location!")
        client_db = sqlite3.connect("file:" + hydrus_db_folder +
                                    "/client.db?mode=ro",
                                    uri=True)
        client_db.row_factory = sqlite3.Row
        cc = client_db.cursor()
        log.info("hydownloader-anchor-exporter",
                 "Querying Hydrus database for current URL IDs...")
        cc.execute('select * from current_files natural inner join url_map')
        for row in cc.fetchall():
            current_url_ids.add(row['url_id'])
        log.info("hydownloader-anchor-exporter",
                 "Querying Hydrus database for deleted URL IDs...")
        cc.execute('select * from deleted_files natural inner join url_map')
        for row in cc.fetchall():
            deleted_url_ids.add(row['url_id'])
        client_db.close()
        if keep_old_hydrus_url_data:
            log.info(
                "hydownloader-anchor-exporter",
                "Old Hydrus URL data will NOT be deleted from the shared hydownloader database"
            )
        else:
            log.info(
                "hydownloader-anchor-exporter",
                "Deleting old Hydrus URL data from shared hydownloader database..."
            )
            db.delete_all_hydrus_known_urls()

    sites_to_keywords: dict[str, Tuple[list[str], list[str]]] = {
        'pixiv': (["pixi"], []),
        'gelbooru': (["gelbooru"], []),
        'nijie': (["nijie"], []),
        'lolibooru': (['lolibooru'], []),
        'danbooru': (['danbooru'], []),
        '3dbooru': (['behoimi'], []),
        'sankaku': (['sankaku'], ["idol."]),
        'idolcomplex': (["idol.sankaku"], []),
        'artstation': (["artstation"], []),
        'twitter': (["twitter", "nitter"], []),
        'deviantart': (['deviantart'], []),
        'tumblr': (["tumblr"], []),
        'hentaifoundry': (["hentai-foundry"], []),
        'yandere': (["yande.re"], [])
    }

    siteset = {x.strip() for x in sites.split(',') if x.strip()}
    if sites == "all":
        siteset = set(sites_to_keywords.keys())
    anchors: Counter[str] = collections.Counter()

    for site in siteset:
        if site not in sites_to_keywords:
            log.fatal('hydownloader-anchor-exporter',
                      f'Unsupported site: {site}')

    def process_url(url):
        patterns = urls.anchor_patterns_from_url(url)
        if patterns:
            recognized_urls.add(url)
            anchors[patterns[0]] += 1
        else:
            suspicious_urls.add(url)

    log.info("hydownloader-anchor-exporter", "Processing URLs...")
    for row in rows:
        processed += 1
        if processed % 1000 == 0:
            print(f"Processed {processed}/{all_rows} URLs", file=sys.stderr)
        if fill_known_urls:
            known_url_status = 1
            is_current = row['url_id'] in current_url_ids
            is_deleted = row['url_id'] in deleted_url_ids
            if is_current and is_deleted:
                known_url_status = 4
            elif is_deleted:
                known_url_status = 3
            elif is_current:
                known_url_status = 2
            db.add_hydrus_known_url(row['url'], known_url_status)
        for site in siteset:
            accepts, rejects = sites_to_keywords[site]
            url_ok = False
            for accept in accepts:
                if accept in row['url']:
                    url_ok = True
                    break
            if url_ok:
                for reject in rejects:
                    if reject in row['url']: url_ok = False
            if url_ok:
                process_url(row['url'])
    log.info("hydownloader-anchor-exporter", "Done processing URLs")

    if unrecognized_urls_file:
        log.info("hydownloader-anchor-exporter",
                 "Writing unrecognized URLs...")
        with open(unrecognized_urls_file, 'w', encoding='utf-8') as f:
            for url in sorted(suspicious_urls):
                f.write(url.strip() + '\n')
        log.info("hydownloader-anchor-exporter",
                 "Done writing unrecognized URLs")
    if recognized_urls_file:
        log.info("hydownloader-anchor-exporter", "Writing recognized URLs...")
        with open(recognized_urls_file, 'w', encoding='utf-8') as f:
            for url in sorted(recognized_urls):
                f.write(url.strip() + '\n')
        log.info("hydownloader-anchor-exporter",
                 "Done writing recognized URLs")

    log.info("hydownloader-anchor-exporter", "Inserting new anchors...")
    anchor_count = len(anchors.keys())
    processed = 0
    new_anchor_rows = 0
    for anchor in anchors:
        processed += 1
        if processed % 50 == 0:
            print(f"Inserting new anchors {processed}/{anchor_count}",
                  file=sys.stderr)
        final_anchors = [anchor]
        if anchor.startswith("nijie"):
            for i in range(anchors[anchor]):
                final_anchors.append(anchor + "_" + str(i))
        if anchor.startswith("twitter") or anchor.startswith("tumblr"):
            for i in range(anchors[anchor] + 1):
                final_anchors.append(anchor + "_" + str(i))
        if anchor.startswith("pixiv"):
            for i in range(anchors[anchor]):
                final_anchors.append(anchor + "_p{:02d}".format(i))
        for f_a in final_anchors:
            if f_a in known_anchors:
                continue
            ac.execute('insert into archive(entry) values (?)', (f_a, ))
            new_anchor_rows += 1
    log.info(
        "hydownloader-anchor-exporter",
        f"Done inserting new anchors, added {new_anchor_rows} entries in total"
    )

    anchor_db.commit()
    anchor_db.close()
    hydrus_db.close()
    db.shutdown()
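
A usage sketch of the extended exporter above; the paths are placeholders and only illustrate the signature:

# Placeholder paths; point these at a real hydownloader data directory and
# Hydrus db folder. sites='all' selects every supported site filter.
update_anchor(
    path='/path/to/hydownloader-data',
    hydrus_db_folder='/path/to/hydrus/db',
    sites='all',
    unrecognized_urls_file=None,
    recognized_urls_file=None,
    fill_known_urls=True,
    keep_old_hydrus_url_data=False,
)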
Example #6
def subscription_worker() -> None:
    global _sub_worker_ended_flag
    try:
        log.info("hydownloader", "Starting subscription worker thread...")
        with _worker_lock:
            _sub_worker_ended_flag = False
        while True:
            time.sleep(2)
            with _worker_lock:
                if _end_threads_flag:
                    break
            subs_due = db.get_due_subscriptions()
            if not subs_due:
                with _worker_lock:
                    if _sub_worker_paused_flag:
                        set_subscription_worker_status("paused")
                    else:
                        set_subscription_worker_status("nothing to do: checked for due subscriptions, found none")
            sub = subs_due[0] if subs_due else None
            while sub:
                with _worker_lock:
                    if _end_threads_flag:
                        break
                    if _sub_worker_paused_flag:
                        set_subscription_worker_status("paused")
                        break
                initial_check = sub['last_check'] is None
                url = urls.subscription_data_to_url(sub['downloader'], sub['keywords'])
                check_started_time = time.time()
                status_msg = f"checking subscription: {sub['id']} (downloader: {sub['downloader']}, keywords: {sub['keywords']})"
                set_subscription_worker_status(status_msg)
                log.info(f"subscription-{sub['id']}", status_msg.capitalize())
                if initial_check:
                    log.info(f"subscription-{sub['id']}", "This is the first check for this subscription")
                result = gallery_dl_utils.run_gallery_dl(
                    url=url,
                    ignore_anchor=False,
                    metadata_only=False,
                    log_file=db.get_rootpath()+f"/logs/subscription-{sub['id']}-gallery-dl-latest.txt",
                    old_log_file=db.get_rootpath()+f"/logs/subscription-{sub['id']}-gallery-dl-old.txt",
                    console_output_file=db.get_rootpath()+f"/temp/subscription-{sub['id']}-gallery-dl-output.txt",
                    unsupported_urls_file=db.get_rootpath()+f"/logs/subscription-{sub['id']}-unsupported-urls-gallery-dl-latest.txt",
                    old_unsupported_urls_file=db.get_rootpath()+f"/logs/subscription-{sub['id']}-unsupported-urls-gallery-dl-old.txt",
                    overwrite_existing=False,
                    filter_=sub['filter'],
                    chapter_filter=None,
                    subscription_mode=True,
                    abort_after=sub['abort_after'],
                    max_file_count = sub['max_files_initial'] if initial_check else sub['max_files_regular']
                    )
                if result:
                    log.warning(f"subscription-{sub['id']}", "Error: "+result)
                else:
                    sub['last_successful_check'] = check_started_time
                sub['last_check'] = check_started_time
                new_files, skipped_files = process_additional_data(subscription_id = sub['id'])
                check_ended_time = time.time()
                db.add_subscription_check(sub['id'], new_files=new_files, already_seen_files=skipped_files, time_started=check_started_time, time_finished=check_ended_time, status=result)
                db.add_or_update_subscriptions([sub])
                status_msg = f"finished checking subscription: {sub['id']} (downloader: {sub['downloader']}, keywords: {sub['keywords']}), new files: {new_files}, skipped: {skipped_files}"
                set_subscription_worker_status(status_msg)
                log.info(f"subscription-{sub['id']}", status_msg.capitalize())
                subs_due = db.get_due_subscriptions()
                sub = subs_due[0] if subs_due else None
            with _worker_lock:
                if _end_threads_flag:
                    break
        with _worker_lock:
            if _end_threads_flag:
                log.info("hydownloader", "Stopping subscription worker thread")
                _sub_worker_ended_flag = True
    except Exception as e:
        log.fatal("hydownloader", "Uncaught exception in subscription worker thread", e)
        shutdown()
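
The worker loop above coordinates through module-level flags guarded by _worker_lock; how those flags get toggled is outside this excerpt, so the following is only a sketch of the pattern with hypothetical helper names:

# Hypothetical controller-side helpers (names are assumptions): they flip
# the flags that subscription_worker polls under _worker_lock.
def pause_subscription_worker() -> None:
    global _sub_worker_paused_flag
    with _worker_lock:
        _sub_worker_paused_flag = True

def request_worker_shutdown() -> None:
    global _end_threads_flag
    with _worker_lock:
        _end_threads_flag = True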
Example #7
def url_queue_worker() -> None:
    global _url_worker_ended_flag
    try:
        log.info("hydownloader", "Starting single URL queue worker thread...")
        with _worker_lock:
            _url_worker_ended_flag = False
        while True:
            time.sleep(2)
            with _worker_lock:
                if _end_threads_flag:
                    break
            urls_to_dl = db.get_urls_to_download()
            if not urls_to_dl:
                with _worker_lock:
                    if _url_worker_paused_flag:
                        set_url_worker_status("paused")
                    else:
                        set_url_worker_status("nothing to do: checked for queued URLs, found none")
            urlinfo = urls_to_dl[0] if urls_to_dl else None
            while urlinfo:
                with _worker_lock:
                    if _end_threads_flag:
                        break
                    if _url_worker_paused_flag:
                        set_url_worker_status("paused")
                        break
                check_time = time.time()
                status_msg = f"downloading URL: {urlinfo['url']}"
                set_url_worker_status(status_msg)
                log.info("single url downloader", status_msg.capitalize())
                result = gallery_dl_utils.run_gallery_dl(
                    url=urlinfo['url'],
                    ignore_anchor=urlinfo['ignore_anchor'],
                    metadata_only=urlinfo['metadata_only'],
                    log_file=db.get_rootpath()+f"/logs/single-urls-{urlinfo['id']}-gallery-dl-latest.txt",
                    old_log_file=db.get_rootpath()+f"/logs/single-urls-{urlinfo['id']}-gallery-dl-old.txt",
                    console_output_file=db.get_rootpath()+f"/temp/single-url-{urlinfo['id']}-gallery-dl-output.txt",
                    unsupported_urls_file=db.get_rootpath()+f"/logs/single-urls-{urlinfo['id']}-unsupported-urls-gallery-dl-latest.txt",
                    old_unsupported_urls_file=db.get_rootpath()+f"/logs/single-urls-{urlinfo['id']}-unsupported-urls-gallery-dl-old.txt",
                    overwrite_existing=urlinfo['overwrite_existing'],
                    filter_=urlinfo['filter'],
                    chapter_filter=None,
                    subscription_mode=False,
                    max_file_count = urlinfo['max_files']
                    )
                if result:
                    log.warning("single url downloader", f"Error while downloading {urlinfo['url']}: {result}")
                    urlinfo['status'] = 1
                    urlinfo['status_text'] = result
                else:
                    urlinfo['status'] = 0
                    urlinfo['status_text'] = 'ok'
                urlinfo['time_processed'] = check_time
                new_files, skipped_files = process_additional_data(url_id = urlinfo['id'])
                urlinfo['new_files'] = new_files
                urlinfo['already_seen_files'] = skipped_files
                db.add_or_update_urls([urlinfo])
                status_msg = f"finished checking URL: {urlinfo['url']}, new files: {new_files}, skipped: {skipped_files}"
                set_url_worker_status(status_msg)
                log.info("single url downloader", status_msg.capitalize())
                urls_to_dl = db.get_urls_to_download()
                urlinfo = urls_to_dl[0] if urls_to_dl else None
            with _worker_lock:
                if _end_threads_flag:
                    break
        with _worker_lock:
            if _end_threads_flag:
                log.info("hydownloader", "Stopping single URL queue worker thread")
                _url_worker_ended_flag = True
    except Exception as e:
        log.fatal("hydownloader", "Uncaught exception in URL worker thread", e)
        shutdown()
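
The queue items consumed above come from db.get_urls_to_download(); the fields read in the loop suggest the minimum shape of an entry. A sketch of enqueueing one URL, assuming db.add_or_update_urls also accepts new entries shaped like this (an assumption, not confirmed by the excerpt):

# Field names mirror those read from urlinfo above; the URL is illustrative
# and whether an 'id'-less dict creates a new row is an assumption.
db.add_or_update_urls([{
    'url': 'https://danbooru.donmai.us/posts/12345',
    'ignore_anchor': False,
    'metadata_only': False,
    'overwrite_existing': False,
    'filter': None,
    'max_files': None,
}])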
Example #8
def check_init() -> None:
    if not _inited:
        log.fatal("hydownloader", "Database used but not initalized")
Example #9
def run_job(path: str, job: str, config: Optional[str], verbose: bool,
            do_it: bool, no_stop_on_missing_metadata: bool) -> None:
    log.init(path, True)
    db.init(path)

    config_path = db.get_rootpath() + '/hydownloader-import-jobs.json'
    data_path = db.get_datapath()
    if config:
        config_path = config
    if not os.path.isfile(config_path):
        log.fatal("hydownloader-importer",
                  f"Configuration file not found: {config_path}")

    with open(config_path, 'r', encoding='utf-8-sig') as config_file:
        jobs = json.load(config_file)
    if job not in jobs:
        log.fatal("hydownloader-importer",
                  f"Job not found in configuration file: {job}")
    jd = jobs[job]

    force_add_metadata = jd.get('forceAddMetadata', True)
    force_add_files = jd.get('forceAddFiles', False)

    client = hydrus.Client(jd['apiKey'], jd['apiURL'])

    log.info("hydownloader-importer", f"Starting import job: {job}")

    # iterate over all files in the data directory
    for root, dirs, files in os.walk(data_path):
        for fname in files:
            # json files hold metadata, don't import them to Hydrus
            if fname.endswith('.json'):
                continue

            # set up some variables
            # some will be used later in the code, some are meant to be used in user-defined expressions
            abspath = root + "/" + fname
            path = os.path.relpath(abspath, start=data_path)
            split_path = os.path.split(path)
            fname_noext, fname_ext = os.path.splitext(fname)
            if fname_ext.startswith('.'): fname_ext = fname_ext[1:]

            # find the path of the associated json metadata file, check if it exists
            # for pixiv ugoira, the same metadata file belongs both to the .webm and the .zip,
            # so this needs special handling
            json_path = abspath + '.json'
            if not os.path.isfile(json_path) and abspath.endswith('.webm'):
                json_path = abspath[:-4] + "zip.json"
            json_exists = True
            if not os.path.isfile(json_path):
                json_exists = False
                printerr(f"Warning: no metadata file found for {path}")
                if not no_stop_on_missing_metadata:
                    sys.exit(1)

            generated_urls = set()
            generated_tags: set[tuple[str, str]] = set()
            matched = False  # will be true if at least 1 filter group matched the file
            json_data = None  # this will hold the associated json metadata (if available)

            if verbose: printerr(f"Processing file: {path}...")

            # iterate over all filter groups, do they match this file?
            for group in jd['groups']:
                # evaluate filter, load json metadata if the filter matches and we haven't loaded it yet
                should_process = False
                try:
                    should_process = eval(group['filter'])
                except Exception:
                    printerr(f"Failed to evaluate filter: {group['filter']}")
                    sys.exit(1)
                if not json_data and json_exists:
                    try:
                        with open(json_path, encoding='utf-8-sig') as json_file:
                            json_data = json.load(json_file)
                    except json.decoder.JSONDecodeError:
                        printerr(f"Failed to parse JSON: {json_path}")
                        sys.exit(1)
                if not should_process:
                    continue
                matched = True

                # get the data for this file from the additional_data db table and process it
                # set up some variables that user-defined expressions will be able to use
                additional_data_dicts = db.get_additional_data_for_file(path)
                if not additional_data_dicts and path.endswith('.webm'):
                    additional_data_dicts = db.get_additional_data_for_file(
                        path[:-4] + "zip")
                extra_tags: defaultdict[str, list[str]] = defaultdict(list)
                min_time_added = -1
                max_time_added = -1
                for d in additional_data_dicts:
                    parse_additional_data(extra_tags, d['data'])
                    if min_time_added == -1 or min_time_added > d['time_added']:
                        min_time_added = d['time_added']
                    if max_time_added == -1 or max_time_added < d['time_added']:
                        max_time_added = d['time_added']
                sub_ids = []
                url_ids = []
                for d in additional_data_dicts:
                    if d['subscription_id']:
                        sub_ids.append(str(d['subscription_id']))
                    if d['url_id']:
                        url_ids.append(str(d['url_id']))

                # execute user-defined tag and url generator expressions
                has_error = False
                for dtype, d in [('tag', x) for x in group.get('tags', [])] + [
                    ('url', x) for x in group.get('urls', [])
                ]:
                    skip_on_error = d.get("skipOnError", False)
                    allow_empty = d.get("allowEmpty", False)
                    rule_name = d.get("name")
                    generated_results = []
                    # if the expression is a single string
                    if isinstance(d["values"], str):
                        try:
                            eval_res = eval(d["values"])
                            # check result type: must be string or iterable of strings
                            if isinstance(eval_res, str):
                                generated_results = [eval_res]
                            else:
                                for eval_res_str in eval_res:
                                    if not isinstance(eval_res_str, str):
                                        printerr(
                                            f"Invalid result type ({str(type(eval_res_str))}) while evaluating expression: {d['values']}"
                                        )
                                        sys.exit(1)
                                    else:
                                        generated_results.append(eval_res_str)
                        except Exception as e:
                            if verbose:
                                printerr(
                                    f"Failed to evaluate expression: {d['values']}"
                                )
                                printerr(e)
                            has_error = True
                    else:  # multiple expressions (array of strings)
                        for eval_expr in d["values"]:
                            try:
                                eval_res = eval(eval_expr)
                                # check result type: must be string or iterable of strings
                                if isinstance(eval_res, str):
                                    generated_results = [eval_res]
                                else:
                                    for eval_res_str in eval_res:
                                        if not isinstance(eval_res_str, str):
                                            printerr(
                                                f"Invalid result type ({str(type(eval_res_str))}) while evaluating expression: {eval_expr}"
                                            )
                                            sys.exit(1)
                                        else:
                                            generated_results.append(
                                                eval_res_str)
                            except Exception as e:
                                if verbose:
                                    printerr(
                                        f"Failed to evaluate expression: {eval_expr}"
                                    )
                                    printerr(e)
                                has_error = True

                    # check for empty results or failed evaluation, as necessary
                    if not generated_results and not allow_empty:
                        printerr(
                            f"Error: the rule named {rule_name} yielded no results but this is not allowed"
                        )
                        sys.exit(1)
                    if has_error:
                        printerr(
                            f"Warning: an expression failed to evaluate in the rule named {rule_name}"
                        )
                        if not skip_on_error:
                            sys.exit(1)

                    # save results of the currently evaluated expressions
                    if dtype == 'url':
                        generated_urls.update(generated_results)
                    else:
                        for repo in d["tagRepos"]:
                            generated_tags.update(
                                (repo, tag) for tag in generated_results)
            if matched:
                printerr(f"File matched: {path}...")

                if not os.path.getsize(abspath):
                    print(f"Found truncated file: {abspath}")
                    sys.exit(1)

                if verbose:
                    printerr("Generated URLs:")
                    for url in generated_urls:
                        printerr(url)
                    printerr("Generated tags:")
                    for repo, tag in sorted(list(generated_tags),
                                            key=lambda x: x[0]):
                        printerr(f"{repo} <- {tag}")
                if verbose: printerr('Hashing...')

                # calculate hash, check if Hydrus already knows the file
                already_added = False
                if do_it:
                    hasher = hashlib.sha256()
                    with open(abspath, 'rb') as hashedfile:
                        buf = hashedfile.read(65536 * 16)
                        while len(buf) > 0:
                            hasher.update(buf)
                            buf = hashedfile.read(65536 * 16)
                    hexdigest = hasher.hexdigest()
                    if client.file_metadata(hashes=[hexdigest],
                                            only_identifiers=True):
                        printerr("File is already in Hydrus")
                        already_added = True

                # send file, tags, metadata to Hydrus as needed
                if not already_added or force_add_files:
                    if verbose: printerr("Sending file to Hydrus...")
                    if do_it: client.add_file(abspath)
                if not already_added or force_add_metadata:
                    if verbose: printerr("Associating URLs...")
                    if do_it:
                        client.associate_url(hashes=[hexdigest],
                                             add=generated_urls)
                    if verbose: printerr("Adding tags...")
                    tag_dict = defaultdict(list)
                    for repo, tag in generated_tags:
                        tag_dict[repo].append(tag)
                    if do_it:
                        client.add_tags(hashes=[hexdigest],
                                        service_to_tags=tag_dict)
            else:
                if verbose:
                    printerr(f"Skipping due to no matching filter: {path}")

    log.info("hydownloader-importer", f"Finished import job: {job}")
    db.shutdown()
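
The keys read from the job configuration in run_job imply the following shape for hydownloader-import-jobs.json; it is rendered here as a Python dict for brevity, and every concrete value (job name, filter, expressions, tag repo) is illustrative rather than taken from a real config:

# Shape implied by the lookups run_job performs (jd[...], group[...], d[...]).
example_jobs = {
    "my-import-job": {                        # passed as the 'job' argument
        "apiKey": "<hydrus client api access key>",
        "apiURL": "http://127.0.0.1:45869",
        "forceAddMetadata": True,             # optional, defaults to True
        "forceAddFiles": False,               # optional, defaults to False
        "groups": [
            {
                # filter and values strings are eval'd by run_job; names from
                # its scope such as 'path', 'fname' and 'json_data' are visible
                "filter": "path.startswith('gallery-dl/gelbooru/')",
                "tags": [
                    {
                        "name": "example tag rule",   # optional
                        "allowEmpty": False,          # optional
                        "skipOnError": False,         # optional
                        "tagRepos": ["my tags"],
                        "values": "json_data['tags'].split()"
                    }
                ],
                "urls": [
                    {"allowEmpty": True, "values": "json_data['file_url']"}
                ]
            }
        ]
    }
}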