Example #1
def delete_subscriptions(sub_ids: list[int]) -> bool:
    check_init()
    c = get_conn().cursor()
    for i in sub_ids:
        c.execute('delete from subscriptions where id = ?', (i,))
    get_conn().commit()
    log.info("hydownloader", f"Deleted subscriptions with IDs: {', '.join(map(str, sub_ids))}")
    return True
Example #2
def delete_urls(url_ids: list[int]) -> bool:
    check_init()
    c = get_conn().cursor()
    for i in url_ids:
        c.execute('delete from single_url_queue where id = ?', (i,))
    get_conn().commit()
    log.info("hydownloader", f"Deleted URLs with IDs: {', '.join(map(str, url_ids))}")
    return True
Example #3
def print_url_entries(entries: list[dict]) -> None:
    for url in entries:
        log.info('hydownloader-report', (
            f"URL: {url['url']}, "
            f"status: {url['status_text']} (code: {url['status']}), "
            f"time added: {format_date(url['time_added'])}, "
            f"time processed: {format_date(url['time_processed'])}, "
            f"paused: {url['paused']}"
        ))
Example #4
def check_results_of_post_url(data: dict, sitename: str) -> bool:
    """
    Downloads a URL with gallery-dl, then checks if the
    downloaded filenames, file content and anchor entries match what was provided by the caller.
    """
    url = data['url']
    filenames = data['filenames']
    anchors = data['anchors']
    log.info("hydownloader-test", f'Testing downloading of posts for site {sitename}')
    log_file = db.get_rootpath()+f"/logs/test-site-{sitename}-gallery-dl.txt"
    result_txt = gallery_dl_utils.run_gallery_dl(
        url=url,
        ignore_anchor=False,
        metadata_only=False,
        log_file=log_file,
        console_output_file=db.get_rootpath()+f"/test/test-site-{sitename}-gallery-dl-output.txt",
        unsupported_urls_file=db.get_rootpath()+f"/test/test-site-{sitename}-unsupported-urls-gallery-dl.txt",
        overwrite_existing=False,
        subscription_mode=False,
        test_mode = True
    )
    result = True
    if result_txt:
        log.error("hydownloader-test", f"Error returned for {sitename} download: {result_txt}")
        result = False
    else:
        log.info("hydownloader-test", f"Return code for {sitename} download OK")
    for fname in filenames:
        abs_fname = db.get_rootpath()+"/test/data/gallery-dl/"+fname
        if not os.path.isfile(abs_fname):
            log.error("hydownloader-test", f"Missing expected file: {fname}")
            result = False
        else:
            log.info("hydownloader-test", f"Found expected file: {fname}")
            for content in filenames[fname]:
                with open(abs_fname) as f:
                    if re.search(content, f.read()):
                        log.info("hydownloader-test", "Expected file content found")
                    else:
                        log.error("hydownloader-test", f"Expected file content ({content}) NOT found")
                        result = False
    conn = sqlite3.connect(db.get_rootpath()+"/test/anchor.db")
    conn.row_factory = sqlite3.Row
    c = conn.cursor()
    for anchor in anchors:
        try:
            c.execute('select entry from archive where entry = ?', (anchor,))
            if len(c.fetchall()):
                log.info("hydownloader-test", f"Expected anchor {anchor} found in database")
            else:
                log.error("hydownloader-test", f"Expected anchor {anchor} NOT found in database")
                result = False
        except sqlite3.OperationalError as e:
            log.error("hydownloader-test", "Error while trying to query anchor database - download failed?", e)
            result = False
    return result
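The data dictionary consumed here mirrors the per-site entries defined in test_internal further below (Example #20): a post URL, a mapping from expected file paths (relative to the test download directory) to regex patterns that must match the file contents, and the anchor entries expected in the test anchor database. A minimal sketch, reusing the gelbooru values from that test data:

data = {
    # post URL handed to gallery-dl
    'url': "https://gelbooru.com/index.php?page=post&s=view&id=6002236",
    # paths relative to <db root>/test/data/gallery-dl/, mapped to regexes expected in the file content
    'filenames': {
        "gelbooru/gelbooru_6002236_0ef507cc4c222406da544db3231de323.jpg.json": ['"rating": "q"'],
        "gelbooru/gelbooru_6002236_0ef507cc4c222406da544db3231de323.jpg": []  # existence check only
    },
    # entries expected in <db root>/test/anchor.db after the download
    'anchors': ["gelbooru6002236"]
}
ok = check_results_of_post_url(data, 'gelbooru')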
Example #5
def print_sub_entries(entries: list[dict]) -> None:  # keywords, downloader, last_check, last_successful_check, check_interval, paused
    for sub in entries:
        log.info('hydownloader-report', (
            f"Downloader: {sub['downloader']}, "
            f"keywords: {sub['keywords']}, "
            f"last check: {format_date(sub['last_check'])}, "
            f"last successful check: {format_date(sub['last_successful_check'])}, "
            f"check interval: {sub['check_interval']}, "
            f"paused: {sub['paused']}"
        ))
Example #6
def add_or_update_subscription_checks(sub_data: list[dict]) -> bool:
    check_init()
    for item in sub_data:
        add = "rowid" not in item
        if add: item["time_created"] = time.time()
        upsert_dict("subscription_checks", item, no_commit = True)
        if add:
            log.info("hydownloader", f"Added subscription check entry: rowid {item['rowid']}")
        else:
            log.info("hydownloader", f"Updated subscription check entry with rowid {item['rowid']}")
    get_conn().commit()
    return True
Example #7
File: db.py Project: Suika/hydownloader
def add_or_update_urls(url_data: list[dict]) -> bool:
    for item in url_data:
        add = "id" not in item
        if add and not "url" in item: continue
        if add: item["time_added"] = time.time()
        if 'url' in item: item['url'] = uri_normalizer.normalizes(item['url'])
        upsert_dict("single_url_queue", item)
        if add:
            log.info("hydownloader", f"Added URL: {item['url']}")
        else:
            log.info("hydownloader", f"Updated URL with ID {item['id']}")
    return True
Example #8
def api_worker(path: str, debug: bool) -> None:
    global _srv
    if db.get_conf('daemon.ssl') and os.path.isfile(path+"/server.pem"):
        log.info("hydownloader", "Starting daemon (with SSL)...")
        _srv = SSLWSGIRefServer(path+"/server.pem", host=db.get_conf('daemon.host'), port=db.get_conf('daemon.port'))
        bottle.run(server=_srv, debug=debug)
    else:
        if db.get_conf('daemon.ssl'):
            log.warning("hydownloader", "SSL enabled in config, but no server.pem file found in the db folder, continuing without SSL...")
        log.info("hydownloader", "Starting daemon...")
        _srv = SSLWSGIRefServer("", host=db.get_conf('daemon.host'), port=db.get_conf('daemon.port'))
        bottle.run(server=_srv, debug=debug)
Example #9
def shutdown() -> None:
    global _shutdown_started
    if _shutdown_started: return
    _shutdown_started = True
    end_threads()
    db.shutdown()
    try:
        log.info("hydownloader", "hydownloader shut down")
    except RuntimeError:
        pass
    sys.stderr.close()
    os._exit(0)
Example #10
File: db.py Project: Suika/hydownloader
def add_or_update_subscriptions(sub_data: list[dict]) -> bool:
    for item in sub_data:
        add = "id" not in item
        if add and not "keywords" in item: continue
        if add and not "downloader" in item: continue
        if add and not "additional_data" in item: item["additional_data"] = ""
        if add: item["time_created"] = time.time()
        upsert_dict("subscriptions", item)
        if add:
            log.info("hydownloader", f"Added subscription: {item['keywords']} for downloader {item['downloader']}")
        else:
            log.info("hydownloader", f"Updated subscription with ID {item['id']}")
    return True
Example #11
def shutdown() -> None:
    global _shutdown_started
    db.close_thread_connections()
    if _shutdown_started: return
    _shutdown_started = True
    end_downloader_threads()
    if _srv:
        _srv.stop()
    db.shutdown()
    try:
        log.info("hydownloader", "hydownloader shut down")
    except RuntimeError:
        pass
    sys.stderr.close()
    os._exit(0)
Example #12
def mass_add_urls(path: str, file_: str, additional_data: Optional[str], metadata_only: bool, overwrite_existing: bool, filter_: Optional[str], ignore_anchor: bool, max_files: Optional[int]) -> None:
    log.init(path, True)
    db.init(path)
    for line in open(file_, 'r'):
        line = line.strip()
        if line:
            db.add_or_update_urls([{
                'url': line,
                'time_added': time.time(),
                'additional_data': additional_data,
                'metadata_only': metadata_only,
                'overwrite_existing': overwrite_existing,
                'filter': filter_,
                'ignore_anchor': ignore_anchor,
                'max_files': max_files
                }])
            log.info("hydownloader-tools", f"Added URL: {line}")
Example #13
def mass_add_subscriptions(path: str, file_: str, downloader: str, additional_data: Optional[str], paused: bool, filter_: Optional[str], abort_after: int, max_files_initial: Optional[int], max_files_regular: Optional[int]) -> None:
    log.init(path, True)
    db.init(path)
    for line in open(file_, 'r'):
        line = line.strip()
        if line:
            db.add_or_update_subscriptions([{
                'keywords': line,
                'downloader': downloader,
                'time_created': time.time(),
                'additional_data': additional_data,
                'filter': filter_,
                'max_files_initial': max_files_initial,
                'max_files_regular': max_files_regular,
                'abort_after': abort_after,
                'paused': paused
                }])
            log.info("hydownloader-tools", f"Added subscription {line} with downloader {downloader}")
Example #14
def init(path : str) -> None:
    global _inited, _path, _config
    _path = path
    if not os.path.isdir(path):
        log.info("hydownloader", f"Initializing new database folder at {path}")
        os.makedirs(path)
    if not os.path.isdir(path + "/logs"):
        os.makedirs(path + "/logs")
    if not os.path.isdir(path + "/data"):
        os.makedirs(path + "/data")
    if not os.path.isdir(path + "/temp"):
        os.makedirs(path + "/temp")
    needs_db_init = False
    if not os.path.isfile(path+"/hydownloader.db"):
        needs_db_init = True
    if not os.path.isfile(path+"/gallery-dl-config.json"):
        gdl_cfg = open(path+"/gallery-dl-config.json", 'w', encoding='utf-8')
        gdl_cfg.write(C.DEFAULT_GALLERY_DL_CONFIG)
        gdl_cfg.close()
    if not os.path.isfile(path+"/gallery-dl-user-config.json"):
        gdl_cfg = open(path+"/gallery-dl-user-config.json", 'w', encoding='utf-8')
        gdl_cfg.write(C.DEFAULT_GALLERY_DL_USER_CONFIG)
        gdl_cfg.close()
    if not os.path.isfile(path+"/hydownloader-config.json"):
        hydl_cfg = open(path+"/hydownloader-config.json", 'w', encoding='utf-8')
        hydl_cfg.write(json.dumps(C.DEFAULT_CONFIG, indent=4))
        hydl_cfg.close()
    if not os.path.isfile(path+"/hydownloader-import-jobs.json"):
        hydl_cfg = open(path+"/hydownloader-import-jobs.json", 'w', encoding='utf-8')
        hydl_cfg.write(json.dumps(C.DEFAULT_IMPORT_JOBS, indent=4))
        hydl_cfg.close()
    if not os.path.isfile(path+"/cookies.txt"):
        open(path+"/cookies.txt", "w", encoding="utf-8").close()
    get_conn()
    if needs_db_init: create_db()
    _config = json.load(open(path+"/hydownloader-config.json", "r", encoding="utf-8-sig"))

    need_shared_db_init = not os.path.isfile(_shared_db_path())
    get_shared_conn()
    if need_shared_db_init: create_shared_db()

    check_db_version()

    _inited = True
Example #15
def mass_add_subscriptions(path: str, file_: str, downloader: str,
                           additional_data: Optional[str], paused: bool,
                           filter_: Optional[str], abort_after: int,
                           max_files_initial: Optional[int],
                           max_files_regular: Optional[int],
                           check_interval: int, random_check_interval: int,
                           encode_keywords: bool) -> None:
    log.init(path, True)
    db.init(path)
    for line in open(file_, 'r', encoding='utf-8-sig'):
        line = line.strip()
        if encode_keywords:
            line = line.replace(' ', '+')
            line = urllib.parse.quote(line, safe='/+').lower()
        if line:
            new_sub = {
                'keywords': line,
                'downloader': downloader,
                'time_created': time.time(),
                'additional_data': additional_data,
                'filter': filter_,
                'paused': paused,
                'check_interval': check_interval + random.randint(0, random_check_interval)
            }
            if max_files_initial is not None:
                new_sub['max_files_initial'] = max_files_initial
            if max_files_regular is not None:
                new_sub['max_files_regular'] = max_files_regular
            if abort_after is not None:
                new_sub['abort_after'] = abort_after
            db.add_or_update_subscriptions([new_sub])
            log.info(
                "hydownloader-tools",
                f"Added subscription {line} with downloader {downloader}")
Example #16
def start(path : str, debug : bool) -> None:
    log.init(path, debug)
    db.init(path)

    process_additional_data()

    subs_thread = threading.Thread(target=subscription_worker, name='Subscription worker', daemon=True)
    subs_thread.start()

    url_thread = threading.Thread(target=url_queue_worker, name='Single URL queue worker', daemon=True)
    url_thread.start()

    if db.get_conf('daemon.ssl') and os.path.isfile(path+"/server.pem"):
        log.info("hydownloader", "Starting daemon (with SSL)...")
        srv = SSLWSGIRefServer(path+"/server.pem", host=db.get_conf('daemon.host'), port=db.get_conf('daemon.port'))
        bottle.run(server=srv, debug=debug)
    else:
        if db.get_conf('daemon.ssl'):
            log.warning("hydownloader", "SSL enabled in config, but no server.pem file found in the db folder, continuing without SSL...")
        log.info("hydownloader", "Starting daemon...")
        srv = SSLWSGIRefServer("", host=db.get_conf('daemon.host'), port=db.get_conf('daemon.port'))
        bottle.run(server=srv, debug=debug)
Example #17
File: db.py Project: Suika/hydownloader
def init(path : str) -> None:
    global _conn, _inited, _path, _config
    _path = path
    if not os.path.isdir(path):
        log.info("hydownloader", f"Initializing new database folder at {path}")
        os.makedirs(path)
    if not os.path.isdir(path + "/logs"):
        os.makedirs(path + "/logs")
    if not os.path.isdir(path + "/data"):
        os.makedirs(path + "/data")
    if not os.path.isdir(path + "/temp"):
        os.makedirs(path + "/temp")
    needs_db_init = False
    if not os.path.isfile(path+"/hydownloader.db"):
        needs_db_init = True
    if not os.path.isfile(path+"/gallery-dl-config.json"):
        gdl_cfg = open(path+"/gallery-dl-config.json", 'w')
        gdl_cfg.write(C.DEFAULT_GALLERY_DL_CONFIG)
        gdl_cfg.close()
    if not os.path.isfile(path+"/gallery-dl-user-config.json"):
        gdl_cfg = open(path+"/gallery-dl-user-config.json", 'w')
        gdl_cfg.write(C.DEFAULT_GALLERY_DL_USER_CONFIG)
        gdl_cfg.close()
    if not os.path.isfile(path+"/hydownloader-config.json"):
        hydl_cfg = open(path+"/hydownloader-config.json", 'w')
        hydl_cfg.write(json.dumps(C.DEFAULT_CONFIG, indent=4))
        hydl_cfg.close()
    if not os.path.isfile(path+"/cookies.txt"):
        open(path+"/cookies.txt", "w").close()
    _conn = sqlite3.connect(path+"/hydownloader.db", check_same_thread=False, timeout=24*60*60)
    _conn.row_factory = lambda c, r: dict(zip([col[0] for col in c.description], r))
    if needs_db_init: create_db()
    check_db_version()
    _config = json.load(open(path+"/hydownloader-config.json", "r"))

    _inited = True
Example #18
def update_anchor(path: str, hydrus_master_db: str, sites: str,
                  unrecognized_urls_file: Optional[str],
                  recognized_urls_file: Optional[str]) -> None:
    """
    This function goes through all URLs in a Hydrus database, and tries to match them to known site-specific URL patterns to
    generate anchor database entries that gallery-dl can recognize. For some sites, the anchor format differs
    from the gallery-dl default, these are set in gallery-dl-config.json.
    """
    log.init(path, True)
    db.init(path)
    if not os.path.isfile(hydrus_master_db):
        log.fatal("hydownloader-anchor-exporter",
                  "The given client.master.db file does not exist!")
    hydrus_db = sqlite3.connect(hydrus_master_db)
    hydrus_db.row_factory = sqlite3.Row
    anchor_init_needed = not os.path.isfile(path + "/anchor.db")
    anchor_db = sqlite3.connect(path + "/anchor.db")
    hc = hydrus_db.cursor()
    ac = anchor_db.cursor()
    if anchor_init_needed:
        ac.execute('CREATE TABLE archive (entry PRIMARY KEY) WITHOUT ROWID')
        anchor_db.commit()
    ac.execute('select * from archive')
    known_anchors = {row[0] for row in ac.fetchall()}
    log.info("hydownloader-anchor-exporter",
             "Querying Hydrus database for URLs...")
    hc.execute('select * from url_domains natural inner join urls')
    rows = hc.fetchall()
    all_rows = len(rows)
    processed = 0
    suspicious_urls = set()
    recognized_urls = set()

    sites_to_keywords: dict[str, Tuple[list[str], list[str]]] = {
        'pixiv': (["pixi"], []),
        'gelbooru': (["gelbooru"], []),
        'nijie': (["nijie"], []),
        'lolibooru': (['lolibooru'], []),
        'danbooru': (['danbooru'], []),
        '3dbooru': (['behoimi'], []),
        'sankaku': (['sankaku'], ["idol."]),
        'idolcomplex': (["idol.sankaku"], []),
        'artstation': (["artstation"], []),
        'twitter': (["twitter", "nitter"], []),
        'deviantart': (['deviantart'], []),
        'tumblr': (["tumblr"], [])
    }

    siteset = {x.strip() for x in sites.split(',') if x.strip()}
    if sites == "all":
        siteset = set(sites_to_keywords.keys())
    anchors: Counter[str] = collections.Counter()

    for site in siteset:
        if not site in sites_to_keywords:
            log.fatal('hydownloader-anchor-exporter',
                      f'Unsupported site: {site}')

    def process_url(url):
        patterns = urls.anchor_patterns_from_url(url)
        if patterns:
            recognized_urls.add(url)
            anchors[patterns[0]] += 1
        else:
            suspicious_urls.add(url)

    log.info("hydownloader-anchor-exporter", "Processing URLs...")
    for row in rows:
        processed += 1
        if processed % 1000 == 0:
            print(f"Processed {processed}/{all_rows} URLs")
        for site in siteset:
            accepts, rejects = sites_to_keywords[site]
            url_ok = False
            for accept in accepts:
                if accept in row['url']:
                    url_ok = True
                    break
            if url_ok:
                for reject in rejects:
                    if reject in row['url']: url_ok = False
            if url_ok:
                process_url(row['url'])
    log.info("hydownloader-anchor-exporter", "Done processing URLs")

    if unrecognized_urls_file:
        log.info("hydownloader-anchor-exporter",
                 "Writing unrecognized URLs...")
        with open(unrecognized_urls_file, 'w') as f:
            for url in sorted(suspicious_urls):
                f.write(url.strip() + '\n')
        log.info("hydownloader-anchor-exporter",
                 "Done writing unrecognized URLs")
    if recognized_urls_file:
        log.info("hydownloader-anchor-exporter", "Writing recognized URLs...")
        with open(recognized_urls_file, 'w') as f:
            for url in sorted(recognized_urls):
                f.write(url.strip() + '\n')
        log.info("hydownloader-anchor-exporter",
                 "Done writing recognized URLs")

    log.info("hydownloader-anchor-exporter", "Inserting new anchors...")
    anchor_count = len(anchors.keys())
    processed = 0
    new_anchor_rows = 0
    for anchor in anchors:
        processed += 1
        if processed % 50 == 0:
            print(f"Inserting new anchors {processed}/{anchor_count}")
        final_anchors = [anchor]
        if anchor.startswith("nijie"):
            for i in range(anchors[anchor]):
                final_anchors.append(anchor + "_" + str(i))
        if anchor.startswith("twitter") or anchor.startswith("tumblr"):
            for i in range(anchors[anchor] + 1):
                final_anchors.append(anchor + "_" + str(i))
        if anchor.startswith("pixiv"):
            for i in range(anchors[anchor]):
                final_anchors.append(anchor + "_p{:02d}".format(i))
        for f_a in final_anchors:
            if f_a in known_anchors:
                continue
            ac.execute('insert into archive(entry) values (?)', (f_a, ))
            new_anchor_rows += 1
    log.info(
        "hydownloader-anchor-exporter",
        f"Done inserting new anchors, added {new_anchor_rows} entries in total"
    )

    anchor_db.commit()
    anchor_db.close()
    hydrus_db.close()
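The URL-to-anchor mapping itself comes from urls.anchor_patterns_from_url; its exact patterns live in that module, but the general shape of the anchors (site name plus post ID, matching entries such as gelbooru6002236 or danbooru4455434 in the test data below) can be illustrated with a hypothetical helper. This is only a sketch of the idea, not the project's actual implementation:

import re
from typing import Optional

def example_anchor_from_url(url: str) -> Optional[str]:
    # Hypothetical illustration of deriving a gallery-dl style anchor ("<site><post id>")
    # from two recognizable post URL shapes; unrecognized URLs would end up in the
    # unrecognized set written out above.
    if m := re.match(r"https?://gelbooru\.com/index\.php\?page=post&s=view&id=(\d+)", url):
        return "gelbooru" + m.group(1)
    if m := re.match(r"https?://danbooru\.donmai\.us/posts/(\d+)", url):
        return "danbooru" + m.group(1)
    return None

example_anchor_from_url("https://danbooru.donmai.us/posts/4455434")  # -> "danbooru4455434"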
Example #19
def clear_test_env() -> None:
    log.info('hydownloader-test', 'Clearing test environment...')
    if os.path.exists(db.get_rootpath() + '/test'):
        shutil.rmtree(db.get_rootpath() + '/test')
    os.makedirs(db.get_rootpath() + "/test")
    log.info('hydownloader-test', 'Test environment cleared')
Example #20
def test_internal(sites: str) -> bool:
    post_url_data = {
        'gelbooru': {
            'url':
            "https://gelbooru.com/index.php?page=post&s=view&id=6002236",
            'filenames': {
                "gelbooru/gelbooru_6002236_0ef507cc4c222406da544db3231de323.jpg.json":
                ["1girl ", "wings", '"rating": "q"', '"tags_general":'],
                "gelbooru/gelbooru_6002236_0ef507cc4c222406da544db3231de323.jpg":
                []
            },
            'anchors': ["gelbooru6002236"]
        },
        'gelbooru_notes': {
            'url':
            "https://gelbooru.com/index.php?page=post&s=view&id=5997331",
            'filenames': {
                "gelbooru/gelbooru_5997331_7726d401af0e6bf5b58809f65d08334e.png.json":
                [
                    '"y": 72', '"x": 35', '"width": 246', '"height": 553',
                    '"body": "Look over this way when you talk~"'
                ]
            },
            'anchors': ["gelbooru5997331"]
        },
        'danbooru': {
            'url': "https://danbooru.donmai.us/posts/4455434",
            'filenames': {
                "danbooru/danbooru_4455434_e110444217827ef3f82fb33b45e1841f.png.json":
                ["1girl ", "tail", '"rating": "q"'],
                "danbooru/danbooru_4455434_e110444217827ef3f82fb33b45e1841f.png":
                []
            },
            'anchors': ["danbooru4455434"]
        },
        'pixiv': {
            'url':
            "https://www.pixiv.net/en/artworks/88865254",
            'filenames': {
                "pixiv/3316400 rogia/88865254_p7.jpg.json": [],
                "pixiv/3316400 rogia/88865254_p6.jpg.json": [],
                "pixiv/3316400 rogia/88865254_p5.jpg.json": [],
                "pixiv/3316400 rogia/88865254_p4.jpg.json": [],
                "pixiv/3316400 rogia/88865254_p3.jpg.json": [],
                "pixiv/3316400 rogia/88865254_p2.jpg.json": [
                    "Fate/GrandOrder", '"title": "メイドロリンチちゃん"', '"tags":',
                    '"tags": ['
                ],
                "pixiv/3316400 rogia/88865254_p1.jpg.json": [],
                "pixiv/3316400 rogia/88865254_p0.jpg.json": [],
                "pixiv/3316400 rogia/88865254_p7.jpg": [],
                "pixiv/3316400 rogia/88865254_p6.jpg": [],
                "pixiv/3316400 rogia/88865254_p5.jpg": [],
                "pixiv/3316400 rogia/88865254_p4.jpg": [],
                "pixiv/3316400 rogia/88865254_p3.jpg": [],
                "pixiv/3316400 rogia/88865254_p2.jpg": [],
                "pixiv/3316400 rogia/88865254_p1.jpg": [],
                "pixiv/3316400 rogia/88865254_p0.jpg": []
            },
            'anchors': [
                "pixiv88865254_p00", "pixiv88865254_p01", "pixiv88865254_p02",
                "pixiv88865254_p03", "pixiv88865254_p04", "pixiv88865254_p05",
                "pixiv88865254_p06", "pixiv88865254_p07"
            ]
        },
        'pixiv_ugoira': {
            'url': "https://www.pixiv.net/en/artworks/88748768",
            'filenames': {
                "pixiv/9313418 thaimay704/88748768_p0.zip": [],
                "pixiv/9313418 thaimay704/88748768_p0.zip.json": [],
                "pixiv/9313418 thaimay704/88748768_p0.webm": []
            },
            'anchors': ["pixiv88748768"]
        },
        'lolibooru': {
            'url':
            'https://lolibooru.moe/post/show/178123/1girl-barefoot-brown_eyes-brown_hair-cameltoe-cove',
            'filenames': {
                "lolibooru/lolibooru_178123_a77d70e0019fc77c25d0ae563fc9b324.jpg.json":
                ["1girl ", " swimsuit", '"rating": "q",'],
                "lolibooru/lolibooru_178123_a77d70e0019fc77c25d0ae563fc9b324.jpg":
                []
            },
            'anchors': ["lolibooru178123"]
        },
        '3dbooru': {
            'url':
            "http://behoimi.org/post/show/648363/apron-black_legwear-collar-cosplay-hairband-immora",
            'filenames': {
                "3dbooru/3dbooru_648363_720f344170696293c3fe2640c59d8f41.jpg.json":
                ["cosplay ", " maid_uniform", '"rating": "s",'],
                "3dbooru/3dbooru_648363_720f344170696293c3fe2640c59d8f41.jpg":
                []
            },
            'anchors': ["3dbooru648363"]
        },
        'nijie': {
            'url': "https://nijie.info/view.php?id=306993",
            'filenames': {
                "nijie/72870/306993_p0.jpg": [],
                "nijie/72870/306993_p1.jpg": [],
                "nijie/72870/306993_p0.jpg.json": [],
                "nijie/72870/306993_p1.jpg.json":
                ["\"オリジナル\"", "\"title\": \"朝7時50分の通学路\","]
            },
            'anchors': ["nijie306993_0", "nijie306993_1"]
        },
        'patreon': {
            'url': "https://www.patreon.com/posts/new-cg-set-on-48042243",
            'filenames': {
                "patreon/Osiimi Chan/48042243_NEW CG SET on Gumroad!! Ganyu's Hypnotic Rendezvou_01.png":
                []
            },
            'anchors': ["patreon48042243_1"]
        },
        'sankaku': {
            'url': "https://chan.sankakucomplex.com/post/show/707246",
            'filenames': {
                "sankaku/sankaku_707246_5da41b5136905c35cad9cbcba89836a3.jpg":
                [],
                "sankaku/sankaku_707246_5da41b5136905c35cad9cbcba89836a3.jpg.json":
                ['"kirisame_marisa"', '"3girls"']
            },
            'anchors': ["sankaku707246"]
        },
        'idolcomplex': {
            'url': "https://idol.sankakucomplex.com/post/show/701724",
            'filenames': {
                "idolcomplex/idolcomplex_701724_92b853bcf8dbff393c6217839013bcab.jpg":
                [],
                "idolcomplex/idolcomplex_701724_92b853bcf8dbff393c6217839013bcab.jpg.json":
                ['"rating": "q",', 'nikumikyo,']
            },
            'anchors': ["idolcomplex701724"]
        },
        'artstation': {
            'url': "https://www.artstation.com/artwork/W2LROD",
            'filenames': {
                "artstation/sergey_vasnev/artstation_6721469_24728858_Procession.jpg":
                [],
                "artstation/sergey_vasnev/artstation_6721469_24728858_Procession.jpg.json":
                ['"title": "Procession",']
            },
            'anchors': ["artstation24728858"]
        },
        'deviantart': {
            'url':
            "https://www.deviantart.com/squchan/art/Atelier-Ryza-820511154",
            'filenames': {
                "deviantart/SquChan/deviantart_820511154_Atelier Ryza.jpg": [],
                "deviantart/SquChan/deviantart_820511154_Atelier Ryza.jpg.json":
                ['"is_mature": true,']
            },
            'anchors': ["deviantart820511154"]
        },
        'twitter': {
            'url':
            "https://twitter.com/momosuzunene/status/1380033327680266244",
            'filenames': {
                "twitter/momosuzunene/1380033327680266244_1.jpg": [],
                "twitter/momosuzunene/1380033327680266244_1.jpg.json":
                ['"name": "momosuzunene",']
            },
            'anchors': ["twitter1380033327680266244_1"]
        },
        'webtoons': {
            'url':
            "https://www.webtoons.com/en/challenge/crawling-dreams/ep-1-nyarla-ghast/viewer?title_no=141539&episode_no=81",
            'anchors': [
                'webtoons141539_81_1', 'webtoons141539_81_2',
                'webtoons141539_81_3', 'webtoons141539_81_4'
            ],
            'filenames': {
                "webtoons/crawling-dreams/81-01.jpg": [],
                "webtoons/crawling-dreams/81-01.jpg.json":
                ['"comic": "crawling-dreams"']
            }
        },
        'baraag': {
            'url': "https://baraag.net/@pumpkinnsfw/106191173043385531",
            'anchors':
            ['baraag106191139078112401', 'baraag106191139927706653'],
            'filenames': {
                "mastodon/baraag.net/pumpkinnsfw/baraag_106191173043385531_106191139078112401.png":
                [],
                "mastodon/baraag.net/pumpkinnsfw/baraag_106191173043385531_106191139078112401.png.json":
                ['"sensitive": true']
            }
        },
        'hentaifoundry': {
            'url':
            "https://www.hentai-foundry.com/pictures/user/PalomaP/907277/Rapunzel-loves-creampie",
            'anchors': ["hentaifoundry907277"],
            'filenames': {
                "hentaifoundry/PalomaP/hentaifoundry_907277_Rapunzel loves creampie.jpg":
                [],
                "hentaifoundry/PalomaP/hentaifoundry_907277_Rapunzel loves creampie.jpg.json":
                ['"tags": [', '"creampie"']
            }
        },
        'yandere': {
            'url': "https://yande.re/post/show/619304",
            'anchors': ["yandere619304"],
            'filenames': {
                'yandere_619304_449a208b7a42f917498a00386e173118.jpg.json':
                ['"tags_artist": "zuima"'],
                'yandere_619304_449a208b7a42f917498a00386e173118.jpg': []
            }
        }
    }

    site_set = {site.strip() for site in sites.split(',')}
    for site in site_set:
        clear_test_env()
        log_file = db.get_rootpath() + f"/logs/test-site-{site}-gallery-dl.txt"
        should_break = False
        if site == 'environment':
            log.info("hydownloader-test", "Querying gallery-dl version")
            version_str = gallery_dl_utils.run_gallery_dl_with_custom_args(
                ['--version'], capture_output=True).stdout.strip()
            try:
                if version_str.endswith("-dev"): version_str = version_str[:-4]
                major, minor, patch = tuple(map(int, version_str.split('.')))
                if major != 1 or minor < 17 or minor == 17 and patch < 4:
                    log.error(
                        'hydownloader-test',
                        f"Bad gallery-dl version: {version_str}, need 1.17.3 or newer"
                    )
                    should_break = True
                else:
                    log.info(
                        'hydownloader-test',
                        f"Found gallery-dl version: {version_str}, this is OK")
            except ValueError as e:
                log.error('hydownloader-test',
                          "Could not recognize gallery-dl version", e)
                should_break = True
            try:
                ff_result = subprocess.run(['ffmpeg', '-version'],
                                           capture_output=True,
                                           text=True,
                                           check=False).stdout.split('\n')[0]
                log.info('hydownloader-test',
                         f"Found ffmpeg version: {ff_result}")
            except FileNotFoundError as e:
                log.error('hydownloader-test', "Could not find ffmpeg", e)
                should_break = True
            try:
                yt_result = subprocess.run(['youtube-dl', '--version'],
                                           capture_output=True,
                                           text=True,
                                           check=False).stdout.strip()
                log.info('hydownloader-test',
                         f"Found youtube-dl version: {yt_result}")
            except FileNotFoundError as e:
                log.error('hydownloader-test', "Could not find youtube-dl", e)
                should_break = True
        elif site == "gelbooru":
            log.info("hydownloader-test", "Testing gelbooru...")

            log.info("hydownloader-test",
                     'Testing search of "sensitive" content')
            sensitive_url = "https://gelbooru.com/index.php?page=post&s=list&tags=loli"
            result = gallery_dl_utils.run_gallery_dl_with_custom_args(
                [
                    sensitive_url, '--get-urls', '-o', 'image-range="1-10"',
                    '--write-log', log_file
                ],
                capture_output=True)
            sensitive_ok = True
            if result.returncode != 0:
                status_txt = gallery_dl_utils.check_return_code(
                    result.returncode)
                log.error(
                    "hydownloader-test",
                    f'Error returned while trying to download "sensitive" content: return code {result.returncode}, {status_txt}'
                )
                sensitive_ok = False
                should_break = True
            sensitive_results_cnt = len(
                re.findall("https://.*?gelbooru.com/images", result.stdout))
            if sensitive_results_cnt < 10:
                log.error(
                    "hydownloader-test",
                    f'Failed to find "sensitive" content, insufficient number of results: {sensitive_results_cnt}'
                )
                sensitive_ok = False
                should_break = True
            if sensitive_ok:
                log.info(
                    "hydownloader-test",
                    'Search of "sensitive" content seems to be working OK')

            should_break = not check_results_of_post_url(
                post_url_data['gelbooru'], site) or should_break

            log.info("hydownloader-test", 'Testing note extraction')
            should_break = not check_results_of_post_url(
                post_url_data['gelbooru_notes'], site) or should_break
        elif site == "danbooru":
            log.info("hydownloader-test", "Testing danbooru...")

            log.info("hydownloader-test",
                     'Testing search of "sensitive" content')
            sensitive_url = "https://danbooru.donmai.us/posts?tags=loli"
            result = gallery_dl_utils.run_gallery_dl_with_custom_args(
                [
                    sensitive_url, '--get-urls', '-o', 'image-range="1-10"',
                    '--write-log', log_file
                ],
                capture_output=True)
            sensitive_ok = True
            if result.returncode != 0:
                status_txt = gallery_dl_utils.check_return_code(
                    result.returncode)
                log.error(
                    "hydownloader-test",
                    f'Error returned while trying to download "sensitive" content: return code {result.returncode}, {status_txt}'
                )
                sensitive_ok = False
                should_break = True
            sensitive_results_cnt = len(
                re.findall("https://danbooru.donmai.us/data", result.stdout))
            if sensitive_results_cnt < 10:
                log.error(
                    "hydownloader-test",
                    f'Failed to find "sensitive" content, insufficient number of results: {sensitive_results_cnt}'
                )
                sensitive_ok = False
                should_break = True
            if sensitive_ok:
                log.info(
                    "hydownloader-test",
                    'Search of "sensitive" content seems to be working OK')

            should_break = not check_results_of_post_url(
                post_url_data['danbooru'], site) or should_break
        elif site == "pixiv":
            log.info("hydownloader-test", "Testing pixiv...")
            should_break = not check_results_of_post_url(
                post_url_data['pixiv'], site) or should_break
            log.info("hydownloader-test", 'Testing downloading of ugoira')
            should_break = not check_results_of_post_url(
                post_url_data['pixiv_ugoira'], site) or should_break
        elif site == "lolibooru":
            log.info("hydownloader-test", "Testing lolibooru.moe...")
            should_break = not check_results_of_post_url(
                post_url_data['lolibooru'], site) or should_break
        elif site == "3dbooru":
            log.info("hydownloader-test", "Testing 3dbooru...")
            should_break = not check_results_of_post_url(
                post_url_data['3dbooru'], site) or should_break
        elif site == "patreon":
            log.info("hydownloader-test", "Testing patreon...")
            should_break = not check_results_of_post_url(
                post_url_data['patreon'], site) or should_break
        elif site == "nijie":
            log.info("hydownloader-test", "Testing nijie.info...")
            should_break = not check_results_of_post_url(
                post_url_data['nijie'], site) or should_break
        elif site == "sankaku":
            log.info("hydownloader-test", "Testing sankaku...")
            should_break = not check_results_of_post_url(
                post_url_data['sankaku'], site) or should_break
        elif site == "idolcomplex":
            log.info("hydownloader-test", "Testing idolcomplex...")
            should_break = not check_results_of_post_url(
                post_url_data['idolcomplex'], site) or should_break
        elif site == "artstation":
            log.info("hydownloader-test", "Testing artstation...")
            should_break = not check_results_of_post_url(
                post_url_data['artstation'], site) or should_break
        elif site == "twitter":
            log.info("hydownloader-test", "Testing twitter...")
            should_break = not check_results_of_post_url(
                post_url_data['twitter'], site) or should_break
        elif site == "deviantart":
            log.info("hydownloader-test", "Testing deviantart...")
            should_break = not check_results_of_post_url(
                post_url_data['deviantart'], site) or should_break
        elif site == "webtoons":
            log.info("hydownloader-test", "Testing webtoons...")
            should_break = not check_results_of_post_url(
                post_url_data['webtoons'], site) or should_break
        elif site == "baraag":
            log.info("hydownloader-test", "Testing baraag...")
            should_break = not check_results_of_post_url(
                post_url_data['baraag'], site) or should_break
        elif site == "hentaifoundry":
            log.info("hydownloader-test", "Testing hentaifoundry...")
            should_break = not check_results_of_post_url(
                post_url_data['hentaifoundry'], site) or should_break
        elif site == "yandere":
            log.info("hydownloader-test", "Testing yande.re...")
            should_break = not check_results_of_post_url(
                post_url_data['yandere'], site) or should_break
        else:
            log.error("hydownloader-test",
                      f"Site name not recognized: {site}, no testing done")
            return False
        if should_break:
            log.error(
                "hydownloader-test",
                f"Stopping early due to errors while testing {site}, test environment kept for inspection"
            )
            return False
        clear_test_env()
    return True
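test_internal takes a comma-separated list of site names (the keys of post_url_data plus the special 'environment' check) and runs each test in a freshly cleared test environment, returning False as soon as one site fails. A minimal invocation sketch, assuming log.init and db.init have already been called for the current data path:

all_ok = test_internal("environment,gelbooru,danbooru")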
Example #21
def update_anchor(path: str, hydrus_db_folder: str, sites: str,
                  unrecognized_urls_file: Optional[str],
                  recognized_urls_file: Optional[str], fill_known_urls: bool,
                  keep_old_hydrus_url_data: bool) -> None:
    """
    This function goes through all URLs in a Hydrus database, and tries to match them to known site-specific URL patterns to
    generate anchor database entries that gallery-dl can recognize. For some sites, the anchor format differs
    from the gallery-dl default, these are set in gallery-dl-config.json.
    If enabled, also fills up the known_urls table in the hydownloader DB with all URLs known by Hydrus.
    """
    log.init(path, True)
    db.init(path)
    if not os.path.isfile(hydrus_db_folder + "/client.master.db"):
        log.fatal(
            "hydownloader-anchor-exporter",
            "The client.master.db database was not found at the given location!"
        )
    hydrus_db = sqlite3.connect("file:" + hydrus_db_folder +
                                "/client.master.db?mode=ro",
                                uri=True)
    hydrus_db.row_factory = sqlite3.Row
    anchor_init_needed = not os.path.isfile(path + "/anchor.db")
    anchor_db = sqlite3.connect(path + "/anchor.db")
    hc = hydrus_db.cursor()
    ac = anchor_db.cursor()
    if anchor_init_needed:
        ac.execute('CREATE TABLE archive (entry PRIMARY KEY) WITHOUT ROWID')
        anchor_db.commit()
    ac.execute('select * from archive')
    known_anchors = {row[0] for row in ac.fetchall()}
    log.info("hydownloader-anchor-exporter",
             "Querying Hydrus database for URLs...")
    hc.execute('select * from url_domains natural inner join urls')
    rows = hc.fetchall()
    all_rows = len(rows)
    processed = 0
    suspicious_urls = set()
    recognized_urls = set()
    current_url_ids = set()
    deleted_url_ids = set()
    if fill_known_urls:
        if not os.path.isfile(hydrus_db_folder + "/client.db"):
            log.fatal(
                "hydownloader-anchor-exporter",
                "The client.db database was not found at the given location!")
        client_db = sqlite3.connect("file:" + hydrus_db_folder +
                                    "/client.db?mode=ro",
                                    uri=True)
        client_db.row_factory = sqlite3.Row
        cc = client_db.cursor()
        log.info("hydownloader-anchor-exporter",
                 "Querying Hydrus database for current URL IDs...")
        cc.execute('select * from current_files natural inner join url_map')
        for row in cc.fetchall():
            current_url_ids.add(row['url_id'])
        log.info("hydownloader-anchor-exporter",
                 "Querying Hydrus database for deleted URL IDs...")
        cc.execute('select * from deleted_files natural inner join url_map')
        for row in cc.fetchall():
            deleted_url_ids.add(row['url_id'])
        client_db.close()
        if keep_old_hydrus_url_data:
            log.info(
                "hydownloader-anchor-exporter",
                "Old Hydrus URL data will NOT be deleted from the shared hydownloader database"
            )
        else:
            log.info(
                "hydownloader-anchor-exporter",
                "Deleting old Hydrus URL data from shared hydownloader database..."
            )
            db.delete_all_hydrus_known_urls()

    sites_to_keywords: dict[str, Tuple[list[str], list[str]]] = {
        'pixiv': (["pixi"], []),
        'gelbooru': (["gelbooru"], []),
        'nijie': (["nijie"], []),
        'lolibooru': (['lolibooru'], []),
        'danbooru': (['danbooru'], []),
        '3dbooru': (['behoimi'], []),
        'sankaku': (['sankaku'], ["idol."]),
        'idolcomplex': (["idol.sankaku"], []),
        'artstation': (["artstation"], []),
        'twitter': (["twitter", "nitter"], []),
        'deviantart': (['deviantart'], []),
        'tumblr': (["tumblr"], []),
        'hentaifoundry': (["hentai-foundry"], []),
        'yandere': (["yande.re"], [])
    }

    siteset = {x.strip() for x in sites.split(',') if x.strip()}
    if sites == "all":
        siteset = set(sites_to_keywords.keys())
    anchors: Counter[str] = collections.Counter()

    for site in siteset:
        if not site in sites_to_keywords:
            log.fatal('hydownloader-anchor-exporter',
                      f'Unsupported site: {site}')

    def process_url(url):
        patterns = urls.anchor_patterns_from_url(url)
        if patterns:
            recognized_urls.add(url)
            anchors[patterns[0]] += 1
        else:
            suspicious_urls.add(url)

    log.info("hydownloader-anchor-exporter", "Processing URLs...")
    for row in rows:
        processed += 1
        if processed % 1000 == 0:
            print(f"Processed {processed}/{all_rows} URLs", file=sys.stderr)
        if fill_known_urls:
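            # Status recorded for each Hydrus-known URL (values as assigned below):
            # 1 = known to Hydrus but not mapped to any current or deleted file,
            # 2 = mapped to a current file, 3 = mapped to a deleted file,
            # 4 = mapped to both a current and a deleted file.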
            known_url_status = 1
            is_current = row['url_id'] in current_url_ids
            is_deleted = row['url_id'] in deleted_url_ids
            if is_current and is_deleted:
                known_url_status = 4
            elif is_deleted:
                known_url_status = 3
            elif is_current:
                known_url_status = 2
            db.add_hydrus_known_url(row['url'], known_url_status)
        for site in siteset:
            accepts, rejects = sites_to_keywords[site]
            url_ok = False
            for accept in accepts:
                if accept in row['url']:
                    url_ok = True
                    break
            if url_ok:
                for reject in rejects:
                    if reject in row['url']: url_ok = False
            if url_ok:
                process_url(row['url'])
    log.info("hydownloader-anchor-exporter", "Done processing URLs")

    if unrecognized_urls_file:
        log.info("hydownloader-anchor-exporter",
                 "Writing unrecognized URLs...")
        with open(unrecognized_urls_file, 'w', encoding='utf-8') as f:
            for url in sorted(suspicious_urls):
                f.write(url.strip() + '\n')
        log.info("hydownloader-anchor-exporter",
                 "Done writing unrecognized URLs")
    if recognized_urls_file:
        log.info("hydownloader-anchor-exporter", "Writing recognized URLs...")
        with open(recognized_urls_file, 'w', encoding='utf-8') as f:
            for url in sorted(recognized_urls):
                f.write(url.strip() + '\n')
        log.info("hydownloader-anchor-exporter",
                 "Done writing recognized URLs")

    log.info("hydownloader-anchor-exporter", "Inserting new anchors...")
    anchor_count = len(anchors.keys())
    processed = 0
    new_anchor_rows = 0
    for anchor in anchors:
        processed += 1
        if processed % 50 == 0:
            print(f"Inserting new anchors {processed}/{anchor_count}",
                  file=sys.stderr)
        final_anchors = [anchor]
        if anchor.startswith("nijie"):
            for i in range(anchors[anchor]):
                final_anchors.append(anchor + "_" + str(i))
        if anchor.startswith("twitter") or anchor.startswith("tumblr"):
            for i in range(anchors[anchor] + 1):
                final_anchors.append(anchor + "_" + str(i))
        if anchor.startswith("pixiv"):
            for i in range(anchors[anchor]):
                final_anchors.append(anchor + "_p{:02d}".format(i))
        for f_a in final_anchors:
            if f_a in known_anchors:
                continue
            ac.execute('insert into archive(entry) values (?)', (f_a, ))
            new_anchor_rows += 1
    log.info(
        "hydownloader-anchor-exporter",
        f"Done inserting new anchors, added {new_anchor_rows} entries in total"
    )

    anchor_db.commit()
    anchor_db.close()
    hydrus_db.close()
    db.shutdown()
Example #22
def process_additional_data(subscription_id: Optional[int] = None,
                            url_id: Optional[int] = None) -> tuple[int, int]:
    """
    This function scans log files outputted by gallery-dl and tries to recognize filenames in the output.
    Based on which subscription or URL those files belong to, it queries the database for the associated additional_data
    values (from the subscriptions or single_url_queue tables), then inserts these filename + data entries
    into the additional_data database table (even if there is no additional_date for the given files).
    This way it is possible to keep track which files were found by which URL downloads/subscriptions, and correctly
    associate additional data with them (even if the files were not actually downloaded by the URL or sub because
    some earlier download already got them).
    If both the subscription and url ID arguments are None, then it scans all files in the temp directory, otherwise
    exactly one of those must not be None and then it only scans for the file belonging to that URL or subscription.
    When parsing gallery-dl output, it is much better to have false positives (recognize some output lines as filenames which are not)
    than to miss any actual filenames, since invalid filename entries in the additional_data table are not a big deal.
    """
    def is_filepath(candidate: str) -> bool:
        candidate = candidate.strip()
        # return ("/" in candidate or "\\" in candidate) and not candidate.startswith("[") and not "gallery-dl:" in candidate
        return os.path.exists(candidate)

    skipped_count = 0
    new_count = 0
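    # Hypothetical examples of console output lines and how the loops below treat them:
    #   /some/dbpath/data/gallery-dl/site/12345_p0.jpg    -> existing file path, counted as new
    #   # /some/dbpath/data/gallery-dl/site/12345_p0.jpg  -> "# " prefix (gallery-dl's marker for files it skipped), counted as already seen
    #   [gallery-dl][info] some log line                  -> not an existing path, ignored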
    if subscription_id is not None and os.path.isfile(
            db.get_rootpath() +
            f"/temp/subscription-{subscription_id}-gallery-dl-output.txt"):
        f = open(db.get_rootpath() +
                 f"/temp/subscription-{subscription_id}-gallery-dl-output.txt",
                 'r',
                 encoding='utf-8-sig')
        for line in f:
            line = line.strip()
            if not is_filepath(line):
                log.debug("hydownloader",
                          f"Does not look like a filepath: {line}")
                continue
            if line.startswith("# "):
                log.debug("hydownloader",
                          f"Looks like a skipped filepath: {line}")
                line = line[1:]
                line = line.strip()
                skipped_count += 1
            else:
                log.debug("hydownloader", f"Looks like a new filepath: {line}")
                new_count += 1
            db.associate_additional_data(filename=line,
                                         subscription_id=subscription_id,
                                         no_commit=True)
        db.sync()
        f.close()
        os.remove(
            db.get_rootpath() +
            f"/temp/subscription-{subscription_id}-gallery-dl-output.txt")
    elif url_id is not None and os.path.isfile(
            db.get_rootpath() +
            f"/temp/single-url-{url_id}-gallery-dl-output.txt"):
        f = open(db.get_rootpath() +
                 f"/temp/single-url-{url_id}-gallery-dl-output.txt",
                 'r',
                 encoding='utf-8-sig')
        for line in f:
            line = line.strip()
            if not is_filepath(line):
                log.debug("hydownloader",
                          f"Does not look like a filepath: {line}")
                continue
            if line.startswith("# "):
                log.debug("hydownloader",
                          f"Looks like a skipped filepath: {line}")
                line = line[1:]
                line = line.strip()
                skipped_count += 1
            else:
                log.debug("hydownloader", f"Looks like a new filepath: {line}")
                new_count += 1
            db.associate_additional_data(filename=line,
                                         url_id=url_id,
                                         no_commit=True)
        db.sync()
        f.close()
        os.remove(db.get_rootpath() +
                  f"/temp/single-url-{url_id}-gallery-dl-output.txt")
    else:
        log.info(
            "hydownloader",
            "Checking for any leftover temporary gallery-dl output files...")
        filenames = os.listdir(db.get_rootpath() + "/temp")
        for filename in filenames:
            if match := re.match("single-url-([0-9]+)-gallery-dl-output.txt",
                                 filename.strip()):
                log.info("hydownloader",
                         f"Processing leftover file {filename}...")
                process_additional_data(url_id=int(match.group(1)))
            elif match := re.match(
                    "subscription-([0-9]+)-gallery-dl-output.txt",
                    filename.strip()):
                log.info("hydownloader",
                         f"Processing leftover file {filename}...")
                process_additional_data(subscription_id=int(match.group(1)))
    return new_count, skipped_count
Example #23
         db.add_log_file_to_parse_queue(l, 'reparse')
 while logfname := db.get_queued_log_file(worker):
     subscription_id = None
     url_id = None
     if m := re.match(r".*(?:\\|\/)single-urls-([0-9]+)-gallery-dl-.*\.txt",
                      logfname):
         url_id = int(m.group(1))
     if m := re.match(
             r".*(?:\\|\/)subscription-([0-9]+)-gallery-dl-.*\.txt",
             logfname):
         subscription_id = int(m.group(1))
     try:
         with open(db.get_rootpath() + "/" + logfname,
                   'r',
                   encoding='utf-8-sig') as logf:
             log.info("hydownloader", f"Parsing log file: {logfname}")
             urls = []
             for line in logf:
                 if m := re.match(
                         r'(?:\[.+\])* (http.*?)(?::[0-9]+)? "[A-Z]+ (\/.*?) HTTP.*',
                         line.strip()):
                     urls.append(m.group(1) + m.group(2))
                 if m := re.match(r".*Starting DownloadJob for '(.*)'$",
                                  line.strip()):
                     urls.append(m.group(1))
             db.add_known_urls(urls,
                               subscription_id=subscription_id,
                               url_id=url_id)
             db.remove_log_file_from_parse_queue(db.get_rootpath() + "/" +
                                                 logfname)
             log.info(
Example #24
def subscription_worker() -> None:
    global _sub_worker_ended_flag
    try:
        log.info("hydownloader", "Starting subscription worker thread...")
        with _worker_lock:
            _sub_worker_ended_flag = False
        while True:
            time.sleep(2)
            with _worker_lock:
                if _end_threads_flag:
                    break
            subs_due = db.get_due_subscriptions()
            if not subs_due:
                with _worker_lock:
                    if _sub_worker_paused_flag:
                        set_subscription_worker_status("paused")
                    else:
                        set_subscription_worker_status("nothing to do: checked for due subscriptions, found none")
            sub = subs_due[0] if subs_due else None
            while sub:
                with _worker_lock:
                    if _end_threads_flag:
                        break
                    if _sub_worker_paused_flag:
                        set_subscription_worker_status("paused")
                        break
                initial_check = sub['last_check'] is None
                url = urls.subscription_data_to_url(sub['downloader'], sub['keywords'])
                check_started_time = time.time()
                status_msg = f"checking subscription: {sub['id']} (downloader: {sub['downloader']}, keywords: {sub['keywords']})"
                set_subscription_worker_status(status_msg)
                log.info(f"subscription-{sub['id']}", status_msg.capitalize())
                if initial_check:
                    log.info(f"subscription-{sub['id']}", "This is the first check for this subscription")
                result = gallery_dl_utils.run_gallery_dl(
                    url=url,
                    ignore_anchor=False,
                    metadata_only=False,
                    log_file=db.get_rootpath()+f"/logs/subscription-{sub['id']}-gallery-dl-latest.txt",
                    old_log_file=db.get_rootpath()+f"/logs/subscription-{sub['id']}-gallery-dl-old.txt",
                    console_output_file=db.get_rootpath()+f"/temp/subscription-{sub['id']}-gallery-dl-output.txt",
                    unsupported_urls_file=db.get_rootpath()+f"/logs/subscription-{sub['id']}-unsupported-urls-gallery-dl-latest.txt",
                    old_unsupported_urls_file=db.get_rootpath()+f"/logs/subscription-{sub['id']}-unsupported-urls-gallery-dl-old.txt",
                    overwrite_existing=False,
                    filter_=sub['filter'],
                    chapter_filter=None,
                    subscription_mode=True,
                    abort_after=sub['abort_after'],
                    max_file_count = sub['max_files_initial'] if initial_check else sub['max_files_regular']
                    )
                if result:
                    log.warning(f"subscription-{sub['id']}", "Error: "+result)
                else:
                    sub['last_successful_check'] = check_started_time
                sub['last_check'] = check_started_time
                new_files, skipped_files = process_additional_data(subscription_id = sub['id'])
                check_ended_time = time.time()
                db.add_subscription_check(sub['id'], new_files=new_files, already_seen_files=skipped_files, time_started=check_started_time, time_finished=check_ended_time, status=result)
                db.add_or_update_subscriptions([sub])
                status_msg = f"finished checking subscription: {sub['id']} (downloader: {sub['downloader']}, keywords: {sub['keywords']}), new files: {new_files}, skipped: {skipped_files}"
                set_subscription_worker_status(status_msg)
                log.info(f"subscription-{sub['id']}", status_msg.capitalize())
                subs_due = db.get_due_subscriptions()
                sub = subs_due[0] if subs_due else None
            with _worker_lock:
                if _end_threads_flag:
                    break
        with _worker_lock:
            if _end_threads_flag:
                log.info("hydownloader", "Stopping subscription worker thread")
                _sub_worker_ended_flag = True
    except Exception as e:
        log.fatal("hydownloader", "Uncaught exception in subscription worker thread", e)
        shutdown()
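The subscription worker above coordinates with the rest of the daemon through module-level flags guarded by a single lock: _end_threads_flag asks workers to stop, _sub_worker_paused_flag suspends checking, and _sub_worker_ended_flag reports that the loop has exited. A stripped-down, hypothetical harness showing that handshake in isolation (the lock and flag roles mirror the code above; the toy names and harness itself are illustrative):

import threading
import time

_worker_lock = threading.Lock()
_end_threads_flag = False      # set by the controller to request shutdown
_worker_paused_flag = False    # set to suspend work without stopping the thread
_worker_ended_flag = True      # set by the worker itself once its loop has exited

def toy_worker() -> None:
    global _worker_ended_flag
    with _worker_lock:
        _worker_ended_flag = False
    while True:
        time.sleep(0.1)
        with _worker_lock:
            if _end_threads_flag:
                break
            paused = _worker_paused_flag
        if not paused:
            pass  # a real worker would poll the database for due work here
    with _worker_lock:
        _worker_ended_flag = True

thread = threading.Thread(target=toy_worker, daemon=True)
thread.start()
time.sleep(0.5)                # let the worker spin a few times
with _worker_lock:
    _end_threads_flag = True   # what shutdown() presumably does for the real workers
thread.join()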
示例#25
0
def url_queue_worker() -> None:
    global _url_worker_ended_flag
    try:
        log.info("hydownloader", "Starting single URL queue worker thread...")
        with _worker_lock:
            _url_worker_ended_flag = False
        while True:
            time.sleep(2)
            with _worker_lock:
                if _end_threads_flag:
                    break
            urls_to_dl = db.get_urls_to_download()
            if not urls_to_dl:
                with _worker_lock:
                    if _url_worker_paused_flag:
                        set_url_worker_status("paused")
                    else:
                        set_url_worker_status("nothing to do: checked for queued URLs, found none")
            urlinfo = urls_to_dl[0] if urls_to_dl else None
            while urlinfo:
                with _worker_lock:
                    if _end_threads_flag:
                        break
                    if _url_worker_paused_flag:
                        set_url_worker_status("paused")
                        break
                check_time = time.time()
                status_msg = f"downloading URL: {urlinfo['url']}"
                set_url_worker_status(status_msg)
                log.info("single url downloader", status_msg.capitalize())
                result = gallery_dl_utils.run_gallery_dl(
                    url=urlinfo['url'],
                    ignore_anchor=urlinfo['ignore_anchor'],
                    metadata_only=urlinfo['metadata_only'],
                    log_file=db.get_rootpath()+f"/logs/single-urls-{urlinfo['id']}-gallery-dl-latest.txt",
                    old_log_file=db.get_rootpath()+f"/logs/single-urls-{urlinfo['id']}-gallery-dl-old.txt",
                    console_output_file=db.get_rootpath()+f"/temp/single-url-{urlinfo['id']}-gallery-dl-output.txt",
                    unsupported_urls_file=db.get_rootpath()+f"/logs/single-urls-{urlinfo['id']}-unsupported-urls-gallery-dl-latest.txt",
                    old_unsupported_urls_file=db.get_rootpath()+f"/logs/single-urls-{urlinfo['id']}-unsupported-urls-gallery-dl-old.txt",
                    overwrite_existing=urlinfo['overwrite_existing'],
                    filter_=urlinfo['filter'],
                    chapter_filter=None,
                    subscription_mode=False,
                    max_file_count = urlinfo['max_files']
                    )
                if result:
                    log.warning("single url downloader", f"Error while downloading {urlinfo['url']}: {result}")
                    urlinfo['status'] = 1
                    urlinfo['status_text'] = result
                else:
                    urlinfo['status'] = 0
                    urlinfo['status_text'] = 'ok'
                urlinfo['time_processed'] = check_time
                new_files, skipped_files = process_additional_data(url_id = urlinfo['id'])
                urlinfo['new_files'] = new_files
                urlinfo['already_seen_files'] = skipped_files
                db.add_or_update_urls([urlinfo])
                status_msg = f"finished checking URL: {urlinfo['url']}, new files: {new_files}, skipped: {skipped_files}"
                set_url_worker_status(status_msg)
                log.info("single url downloader", status_msg.capitalize())
                urls_to_dl = db.get_urls_to_download()
                urlinfo = urls_to_dl[0] if urls_to_dl else None
            with _worker_lock:
                if _end_threads_flag:
                    break
        with _worker_lock:
            if _end_threads_flag:
                log.info("hydownloader", "Stopping single URL queue worker thread")
                _url_worker_ended_flag = True
    except Exception as e:
        log.fatal("hydownloader", "Uncaught exception in URL worker thread", e)
        shutdown()
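The URL queue worker mutates the row it gets from db.get_urls_to_download() and writes it back with db.add_or_update_urls(). A hedged sketch of the fields a queue entry apparently has to carry for the loop above to work; the field names come from the code, the concrete values are made up:

import time

# Illustrative single-URL queue entry; only keys referenced by url_queue_worker are shown.
urlinfo = {
    'id': 1,
    'url': 'https://example.com/posts/12345',
    'ignore_anchor': False,
    'metadata_only': False,
    'overwrite_existing': False,
    'filter': None,
    'max_files': None,
    'status': -1,               # assumed "not yet processed" marker
    'status_text': '',
    'time_processed': None,
    'new_files': None,
    'already_seen_files': None,
}

# After a successful gallery-dl run the worker records the outcome roughly like this:
urlinfo['status'] = 0
urlinfo['status_text'] = 'ok'
urlinfo['time_processed'] = time.time()
urlinfo['new_files'], urlinfo['already_seen_files'] = 3, 1  # counts from process_additional_data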
示例#26
0
def run_job(path: str, job: str, config: Optional[str], verbose: bool,
            do_it: bool, no_stop_on_missing_metadata: bool) -> None:
    log.init(path, True)
    db.init(path)

    config_path = db.get_rootpath() + '/hydownloader-import-jobs.json'
    data_path = db.get_datapath()
    if config:
        config_path = config
    if not os.path.isfile(config_path):
        log.fatal("hydownloader-importer",
                  f"Configuration file not found: {config_path}")

    with open(config_path, 'r', encoding='utf-8-sig') as config_file:
        jobs = json.load(config_file)
    if job not in jobs:
        log.fatal("hydownloader-importer",
                  f"Job not found in configuration file: {job}")
    jd = jobs[job]

    force_add_metadata = jd.get('forceAddMetadata', True)
    force_add_files = jd.get('forceAddFiles', False)

    client = hydrus.Client(jd['apiKey'], jd['apiURL'])

    log.info("hydownloader-importer", f"Starting import job: {job}")

    # iterate over all files in the data directory
    for root, dirs, files in os.walk(data_path):
        for fname in files:
            # json files hold metadata, don't import them to Hydrus
            if fname.endswith('.json'):
                continue

            # set up some variables
            # some will be used later in the code, some are meant to be used in user-defined expressions
            abspath = root + "/" + fname
            path = os.path.relpath(abspath, start=data_path)
            split_path = os.path.split(path)
            fname_noext, fname_ext = os.path.splitext(fname)
            if fname_ext.startswith('.'): fname_ext = fname_ext[1:]

            # find the path of the associated json metadata file, check if it exists
            # for pixiv ugoira, the same metadata file belongs both to the .webm and the .zip,
            # so this needs special handling
            json_path = abspath + '.json'
            if not os.path.isfile(json_path) and abspath.endswith('.webm'):
                json_path = abspath[:-4] + "zip.json"
            json_exists = True
            if not os.path.isfile(json_path):
                json_exists = False
                printerr(f"Warning: no metadata file found for {path}")
                if not no_stop_on_missing_metadata:
                    sys.exit(1)

            generated_urls = set()
            generated_tags: set[tuple[str, str]] = set()
            matched = False  # will be true if at least 1 filter group matched the file
            json_data = None  # this will hold the associated json metadata (if available)

            if verbose: printerr(f"Processing file: {path}...")

            # iterate over all filter groups, do they match this file?
            for group in jd['groups']:
                # evaluate filter, load json metadata if the filter matches and we haven't loaded it yet
                should_process = False
                try:
                    should_process = eval(group['filter'])
                except Exception:
                    printerr(f"Failed to evaluate filter: {group['filter']}")
                    sys.exit(1)
                if not json_data and json_exists:
                    try:
                        with open(json_path, encoding='utf-8-sig') as json_file:
                            json_data = json.load(json_file)
                    except json.decoder.JSONDecodeError:
                        printerr(f"Failed to parse JSON: {json_path}")
                        sys.exit(1)
                if not should_process:
                    continue
                matched = True

                # get the data for this file from the additional_data db table and process it
                # set up some variables that user-defined expressions will be able to use
                additional_data_dicts = db.get_additional_data_for_file(path)
                if not additional_data_dicts and path.endswith('.webm'):
                    additional_data_dicts = db.get_additional_data_for_file(
                        path[:-4] + "zip")
                extra_tags: defaultdict[str, list[str]] = defaultdict(list)
                min_time_added = -1
                max_time_added = -1
                for d in additional_data_dicts:
                    parse_additional_data(extra_tags, d['data'])
                    if min_time_added == -1 or min_time_added > d['time_added']:
                        min_time_added = d['time_added']
                    if max_time_added == -1 or max_time_added < d['time_added']:
                        max_time_added = d['time_added']
                sub_ids = []
                url_ids = []
                for d in additional_data_dicts:
                    if d['subscription_id']:
                        sub_ids.append(str(d['subscription_id']))
                    if d['url_id']:
                        url_ids.append(str(d['url_id']))

                # execute user-defined tag and url generator expressions
                has_error = False
                for dtype, d in [('tag', x) for x in group.get('tags', [])] + [
                    ('url', x) for x in group.get('urls', [])
                ]:
                    skip_on_error = d.get("skipOnError", False)
                    allow_empty = d.get("allowEmpty", False)
                    rule_name = d.get("name")
                    generated_results = []
                    # if the expression is a single string
                    if isinstance(d["values"], str):
                        try:
                            eval_res = eval(d["values"])
                            # check result type: must be string or iterable of strings
                            if isinstance(eval_res, str):
                                generated_results = [eval_res]
                            else:
                                for eval_res_str in eval_res:
                                    if not isinstance(eval_res_str, str):
                                        printerr(
                                            f"Invalid result type ({str(type(eval_res_str))}) while evaluating expression: {d['values']}"
                                        )
                                        sys.exit(1)
                                    else:
                                        generated_results.append(eval_res_str)
                        except Exception as e:
                            if verbose:
                                printerr(
                                    f"Failed to evaluate expression: {d['values']}"
                                )
                                printerr(e)
                            has_error = True
                    else:  # multiple expressions (array of strings)
                        for eval_expr in d["values"]:
                            try:
                                eval_res = eval(eval_expr)
                                # check result type: must be string or iterable of strings
                                if isinstance(eval_res, str):
                                    # append rather than replace, so results from earlier
                                    # expressions in the same rule are not discarded
                                    generated_results.append(eval_res)
                                else:
                                    for eval_res_str in eval_res:
                                        if not isinstance(eval_res_str, str):
                                            printerr(
                                                f"Invalid result type ({str(type(eval_res_str))}) while evaluating expression: {eval_expr}"
                                            )
                                            sys.exit(1)
                                        else:
                                            generated_results.append(
                                                eval_res_str)
                            except Exception as e:
                                if verbose:
                                    printerr(
                                        f"Failed to evaluate expression: {eval_expr}"
                                    )
                                    printerr(e)
                                has_error = True

                    # check for empty results or failed evaluation, as necessary
                    if not generated_results and not allow_empty:
                        printerr(
                            f"Error: the rule named {rule_name} yielded no results but this is not allowed"
                        )
                        sys.exit(1)
                    if has_error:
                        printerr(
                            f"Warning: an expression failed to evaluate in the rule named {rule_name}"
                        )
                        if not skip_on_error:
                            sys.exit(1)

                    # save results of the currently evaluated expressions
                    if dtype == 'url':
                        generated_urls.update(generated_results)
                    else:
                        for repo in d["tagRepos"]:
                            generated_tags.update(
                                (repo, tag) for tag in generated_results)
            if matched:
                printerr(f"File matched: {path}...")

                if not os.path.getsize(abspath):
                    printerr(f"Found truncated file: {abspath}")
                    sys.exit(1)

                if verbose:
                    printerr("Generated URLs:")
                    for url in generated_urls:
                        printerr(url)
                    printerr("Generated tags:")
                    for repo, tag in sorted(list(generated_tags),
                                            key=lambda x: x[0]):
                        printerr(f"{repo} <- {tag}")
                if verbose: printerr('Hashing...')

                # calculate hash, check if Hydrus already knows the file
                already_added = False
                if do_it:
                    hasher = hashlib.sha256()
                    with open(abspath, 'rb') as hashedfile:
                        buf = hashedfile.read(65536 * 16)
                        while len(buf) > 0:
                            hasher.update(buf)
                            buf = hashedfile.read(65536 * 16)
                    hexdigest = hasher.hexdigest()
                    if client.file_metadata(hashes=[hexdigest],
                                            only_identifiers=True):
                        printerr("File is already in Hydrus")
                        already_added = True

                # send file, tags, metadata to Hydrus as needed
                if not already_added or force_add_files:
                    if verbose: printerr("Sending file to Hydrus...")
                    if do_it: client.add_file(abspath)
                if not already_added or force_add_metadata:
                    if verbose: printerr("Associating URLs...")
                    if do_it:
                        client.associate_url(hashes=[hexdigest],
                                             add=generated_urls)
                    if verbose: printerr("Adding tags...")
                    tag_dict = defaultdict(list)
                    for repo, tag in generated_tags:
                        tag_dict[repo].append(tag)
                    if do_it:
                        client.add_tags(hashes=[hexdigest],
                                        service_to_tags=tag_dict)
            else:
                if verbose:
                    printerr(f"Skipping due to no matching filter: {path}")

    log.info("hydownloader-importer", f"Finished import job: {job}")
    db.shutdown()
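run_job above drives everything from the job definitions in hydownloader-import-jobs.json: it reads apiURL, apiKey, forceAddMetadata, forceAddFiles and a list of groups, each with an eval()'d filter plus tag and url rules. A hedged sketch of that structure written as a Python dict; the key names come from the code above, while the job name, endpoint, expressions and tag repo are invented for illustration:

# Mirrors the JSON structure run_job expects; all concrete values here are examples only.
example_jobs = {
    "example-job": {
        "apiURL": "http://127.0.0.1:45869",   # Hydrus client API endpoint (illustrative)
        "apiKey": "0123456789abcdef",         # Hydrus client API access key (illustrative)
        "forceAddMetadata": True,             # defaults to True if omitted
        "forceAddFiles": False,               # defaults to False if omitted
        "groups": [
            {
                # evaluated with eval(); variables like path, fname, fname_ext and
                # split_path are in scope, as set up in the importer above
                "filter": "path.startswith('gallery-dl/somesite/')",
                "tags": [
                    {
                        "name": "example tag rule",
                        "tagRepos": ["my tags"],
                        "allowEmpty": False,
                        "skipOnError": False,
                        # a single expression or a list of expressions; each must evaluate
                        # to a string or an iterable of strings
                        "values": "'site:somesite'"
                    }
                ],
                "urls": [
                    {
                        "name": "example url rule",
                        "allowEmpty": True,
                        "values": ["json_data['file_url']"]   # json_data key is illustrative
                    }
                ]
            }
        ]
    }
}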
示例#27
0
def report(verbose: bool, urls: bool = True) -> None:
    check_init()
    c = get_conn().cursor()

    def format_date(timestamp: Optional[Union[float, int, str]]) -> str:
        if isinstance(timestamp, str):
            return timestamp
        if timestamp is None:
            return 'never'
        return datetime.datetime.fromtimestamp(float(timestamp)).isoformat()

    log.info('hydownloader-report', 'Generating report...')
    urls_paused = len(c.execute('select * from single_url_queue where paused = 1').fetchall())
    subs_paused = len(c.execute('select * from subscriptions where paused = 1').fetchall())
    urls_errored_entries = c.execute('select * from single_url_queue where status > 0').fetchall()
    urls_errored = len(urls_errored_entries)
    subs_errored_entries = c.execute('select * from subscriptions where last_check is not null and last_successful_check <> last_check').fetchall()
    subs_errored = len(subs_errored_entries)
    urls_no_files_entries = c.execute('select * from single_url_queue where status = 0 and (new_files is null or already_seen_files is null or new_files + already_seen_files = 0)').fetchall()
    urls_no_files = len(urls_no_files_entries)
    subs_no_files_entries = c.execute((
        'select * from subscriptions where last_check is not null and id in '
        '(select subscription_id from subscription_checks group by subscription_id having sum(new_files) + sum(already_seen_files) <= 0)'
    )).fetchall()
    subs_no_files = len(subs_no_files_entries)
    urls_waiting_long_entries = c.execute(f'select * from single_url_queue where time_processed is null and time_added + 86400 <= {time.time()}').fetchall()
    urls_waiting_long = len(urls_waiting_long_entries)
    subs_waiting_long_entries = c.execute((
        f'select * from subscriptions where (last_check is not null and last_check + check_interval <= {time.time()})'
        f' or (last_check is null and time_created + check_interval <= {time.time()})'
    )).fetchall()
    subs_waiting_long = len(subs_waiting_long_entries)
    subs_no_recent_files_entries = c.execute((
        'select * from subscriptions where last_check is not null and id in '
        f'(select subscription_id from subscription_checks where time_started + 30 * 86400 >= {time.time()} group by subscription_id having sum(new_files) + sum(already_seen_files) <= 0)'
        f' or id not in (select subscription_id from subscription_checks group by subscription_id having max(time_started) + 30 * 86400 < {time.time()})'
    )).fetchall()
    subs_no_recent_files = len(subs_no_recent_files_entries)
    subs_queued = len(get_due_subscriptions())
    urls_queued = len(get_urls_to_download())
    all_subs = len(c.execute('select * from subscriptions').fetchall())
    all_urls = len(c.execute('select * from single_url_queue').fetchall())
    all_sub_checks = len(c.execute('select * from subscription_checks').fetchall())
    all_file_results = len(c.execute('select * from additional_data').fetchall())
    last_time_url_processed_results = c.execute('select max(time_processed) t from single_url_queue').fetchall()
    last_time_url_processed = format_date(last_time_url_processed_results[0]['t'] if last_time_url_processed_results else 'never')
    last_time_sub_checked_results = c.execute('select max(time_finished) t from subscription_checks').fetchall()
    last_time_sub_checked = format_date(last_time_sub_checked_results[0]['t'] if last_time_sub_checked_results else 'never')

    def print_url_entries(entries: list[dict]) -> None:
        for url in entries:
            log.info('hydownloader-report', (
                f"URL: {url['url']}, "
                f"status: {url['status_text']} (code: {url['status']}), "
                f"time added: {format_date(url['time_added'])}, "
                f"time processed: {format_date(url['time_processed'])}, "
                f"paused: {url['paused']}"
            ))

    def print_sub_entries(entries: list[dict]) -> None:
        # fields shown: downloader, keywords, last_check, last_successful_check, check_interval, paused
        for sub in entries:
            log.info('hydownloader-report', (
                f"Downloader: {sub['downloader']}, "
                f"keywords: {sub['keywords']}, "
                f"last check: {format_date(sub['last_check'])}, "
                f"last successful check: {format_date(sub['last_successful_check'])}, "
                f"check interval: {sub['check_interval']}, "
                f"paused: {sub['paused']}"
            ))

    log.info('hydownloader-report', f'Subscriptions: {all_subs}')
    if urls: log.info('hydownloader-report', f'Single URLs: {all_urls}')
    log.info('hydownloader-report', f'Subscription checks: {all_sub_checks}')
    log.info('hydownloader-report', f'All file results (including duplicates and skipped): {all_file_results}')
    log.info('hydownloader-report', f'Last time a subscription was checked: {last_time_sub_checked}')
    if urls: log.info('hydownloader-report', f'Last time a URL was downloaded: {last_time_url_processed}')
    log.info('hydownloader-report', f'Subscriptions due for a check: {subs_queued}')
    if urls: log.info('hydownloader-report', f'URLs waiting to be downloaded: {urls_queued}')
    log.info('hydownloader-report', f'Paused subscriptions: {subs_paused}')
    if urls: log.info('hydownloader-report', f'Paused URLs: {urls_paused}')
    if urls: log.info('hydownloader-report', f'Errored URLs: {urls_errored}')
    if verbose and urls_errored and urls:
        log.info('hydownloader-report', 'These are listed below:')
        print_url_entries(urls_errored_entries)
    log.info('hydownloader-report', f'Errored subscriptions: {subs_errored}')
    if verbose and subs_errored:
        log.info('hydownloader-report', 'These are listed below:')
        print_sub_entries(subs_errored_entries)
    if urls: log.info('hydownloader-report', f'URLs that did not error but produced no files: {urls_no_files}')
    if verbose and urls_no_files and urls:
        log.info('hydownloader-report', 'These are listed below:')
        print_url_entries(urls_no_files_entries)
    log.info('hydownloader-report', f'Subscriptions that did not error but produced no files: {subs_no_files}')
    if verbose and subs_no_files:
        log.info('hydownloader-report', 'These are listed below:')
        print_sub_entries(subs_no_files_entries)
    if urls: log.info('hydownloader-report', f'URLs waiting to be downloaded for more than a day: {urls_waiting_long}')
    if verbose and urls_waiting_long and urls:
        log.info('hydownloader-report', 'These are listed below:')
        print_url_entries(urls_waiting_long_entries)
    log.info('hydownloader-report', f'Subscriptions waiting for a check longer than their check interval: {subs_waiting_long}')
    if verbose and subs_waiting_long:
        log.info('hydownloader-report', 'These are listed below:')
        print_sub_entries(subs_waiting_long_entries)
    log.info('hydownloader-report', f'Subscriptions that were checked at least once but did not produce any files in the past 30 days: {subs_no_recent_files}')
    if verbose and subs_no_recent_files:
        log.info('hydownloader-report', 'These are listed below:')
        print_sub_entries(subs_no_recent_files_entries)

    log.info('hydownloader-report', 'Report finished')
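format_date above has to cope with three kinds of input coming out of the report queries: real timestamps, NULLs for things that never happened, and the literal string 'never' passed in by the caller. A quick standalone illustration (the helper is re-declared here so the snippet runs on its own):

import datetime
from typing import Optional, Union

def format_date(timestamp: Optional[Union[float, int, str]]) -> str:
    # same logic as the nested helper in report()
    if isinstance(timestamp, str):
        return timestamp
    if timestamp is None:
        return 'never'
    return datetime.datetime.fromtimestamp(float(timestamp)).isoformat()

print(format_date(None))        # 'never', e.g. a subscription that was never checked
print(format_date('never'))     # strings pass through unchanged
print(format_date(1600000000))  # 2020-09-13T12:26:40 UTC; printed value depends on the local timezone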