def export_db_contents(to_path, site_name):
    '''
    Dump every item for every artist on `site_name` to per-artist JSON files in `to_path`.
    '''
    print("Dumping contents for site %s to folder %s" % (site_name, to_path))

    # Build a map of scrape-target ID -> artist name for the requested site.
    ids = {}
    with db.context_sess() as sess:
        res = sess.query(db.ScrapeTargets) \
            .filter(db.ScrapeTargets.site_name == site_name) \
            .all()
        for result in res:
            ids[result.id] = result.artist_name

    print("Found %s items!" % (len(ids), ))

    for uid, uname in tqdm.tqdm(ids.items(), desc="Artists"):
        with db.context_sess() as sess:
            posts = sess.query(db.ArtItem) \
                .filter(db.ArtItem.artist_id == uid) \
                .order_by(db.ArtItem.addtime) \
                .all()

            dat = []
            for post in tqdm.tqdm(posts, desc="Artist posts"):
                tmp = (
                    uname,
                    post.title,
                    post.content,
                    post.content_structured,
                    list(ptag.tag for ptag in post.tags),
                    list((
                        pfile.item_id,
                        pfile.seqnum,
                        pfile.file_meta,
                        pfile.state,
                        pfile.fspath,
                        pfile.filename,
                    ) for pfile in post.files),
                )
                dat.append(tmp)

            # One JSON dump file per artist.
            outp = os.path.join(to_path, "%s-%s-%s-dump.json" % (site_name, uname, uid))
            with open(outp, "w") as fp:
                json.dump(dat, fp, indent=4)


def db_misrelink_clean():
    '''
    Find ArtItem rows that share a release_meta value, re-check each of them, and
    merge any sets that are still duplicated afterwards.
    '''
    print('Misrelink cleaning')
    with db.context_sess() as sess:
        print("Getting duplicates")
        dupes = get_duplicated_attrs(sess, db.ArtItem, "release_meta")
        print("Duplicates:")
        for dupe_url, in dupes:
            releases = sess.query(db.ArtItem).options(joinedload('artist')) \
                .filter(db.ArtItem.release_meta == dupe_url).all()
            for release in releases:
                check_item(sess, release)

            # Re-query, since check_item() may have fixed up or removed rows.
            releases_2 = sess.query(db.ArtItem).options(joinedload('artist')) \
                .filter(db.ArtItem.release_meta == dupe_url).all()
            if len(releases_2) > 1:
                # Merge the refreshed set, not the stale pre-check list.
                merge_release_set(sess, releases_2)


def db_name_clean():
    '''
    Clean up ScrapeTargets rows whose names differ only by case or surrounding
    whitespace: merge duplicates on case-insensitive sites, strip stray whitespace
    elsewhere.
    '''
    print("db clean")
    with db.context_sess() as sess:
        artists = sess.query(db.ScrapeTargets).all()
        amap = {}
        for artist in tqdm.tqdm(artists):
            # Key on (site, case-folded and stripped name) so near-duplicates collide.
            akey = (artist.site_name, artist.artist_name.lower().strip())
            if akey in amap:
                print("Duplicate: ", akey)
                print(artist.site_name, artist.artist_name)
                print(amap[akey].site_name, amap[akey].artist_name)

                # Prefer the entry without leading/trailing whitespace.
                if artist.artist_name.strip() == artist.artist_name:
                    good = artist
                    bad = amap[akey]
                else:
                    good = amap[akey]
                    bad = artist

                print("Remove: %s -> '%s'" % (bad.site_name, bad.artist_name))
                print("Keep: %s -> '%s'" % (good.site_name, good.artist_name))

                # Frankly, if a site *isn't* case insensitive, I think they have a design
                # flaw, but it's something I have to consider.
                case_insensitive_sites = ['da', 'wy', 'ib', 'sf', 'hf', 'fa', 'tum']
                if artist.site_name in case_insensitive_sites:
                    print("Deleting duplicate.")
                    consolidate_artist(sess, bad, good)
                    sess.delete(bad)
                    sess.commit()
            else:
                if artist.artist_name.strip() != artist.artist_name:
                    print("Fixing whitespace: ", (artist.artist_name.strip(), artist.artist_name))
                    artist.artist_name = artist.artist_name.strip()
                    sess.commit()

            amap[akey] = artist

        sess.commit()


def dump_item_meta():
    '''
    Write a sidecar `<file>.json` of item metadata next to every downloaded file
    that actually exists on disk.
    '''
    print("dump_item_meta")
    with db.context_sess() as sess:
        print("Counting")
        tot = sess.query(db.ArtFile).count()
        print("Have %s rows" % (tot, ))
        res_q = sess.query(db.ArtFile)
        print("Doing query")

        for row in tqdm.tqdm(res_q.yield_per(500), total=tot):
            tags               = [tag.tag for tag in row.item.tags]
            artist_name        = row.item.artist.artist_name
            site_name          = row.item.artist.site_name
            title              = row.item.title
            content            = row.item.content
            content_structured = row.item.content_structured
            filename           = row.fspath
            src_filename       = row.filename

            original_file_name = os.path.join(settings['dldCtntPath'], filename)
            write_to           = os.path.join(settings['dldCtntPath'], filename + ".json")

            # Only write the metadata sidecar if the downloaded file is present.
            if os.path.exists(original_file_name):
                with open(write_to, "w") as fp:
                    fp.write(
                        json.dumps(
                            {
                                "tags": tags,
                                "artist_name": artist_name,
                                "site_name": site_name,
                                "title": title,
                                "content": content,
                                "content_structured": content_structured,
                                "filename": filename,
                                "src_filename": src_filename,
                            },
                            indent=4,
                        ))


def reset_last_fetched_times(plugin_name=None):
    '''
    Reset the last-fetched timestamp for every artist of the named plugin (or of
    all enabled plugins if `plugin_name` is None), forcing a re-fetch on the next
    scrape pass.
    '''
    for key, (plugin_cls, readable_name) in ENABLED_PLUGINS.items():
        # If a specific plugin was requested, skip all the others.
        if plugin_name and plugin_name != key:
            continue

        instance = plugin_cls()
        with db.context_sess() as sess:
            res = sess.query(db.ScrapeTargets) \
                .filter(db.ScrapeTargets.site_name == key) \
                .all()
            names = [row.artist_name for row in res]
            sess.commit()

        # Push every artist's last-fetched time back to the minimum datetime so
        # the scraper treats them all as stale.
        for name in tqdm.tqdm(names):
            instance.update_last_fetched(artist=name, fetch_time=datetime.datetime.min, force=True)

        # print("Names:", len(names))
        print(key, plugin_name, readable_name)