Example #1
def export_db_contents(to_path, site_name):
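    """Dump every post for each tracked artist on `site_name` into a per-artist JSON file under `to_path`."""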
    print("Dumping contents for site %s to folder %s" % (site_name, to_path))

    ids = {}
    with db.context_sess() as sess:
        res = sess.query(db.ScrapeTargets) \
            .filter(db.ScrapeTargets.site_name == site_name) \
            .all()
        for result in res:
            ids[result.id] = result.artist_name

    print("Found %s items!" % (len(ids), ))

    for uid, uname in tqdm.tqdm(ids.items(), desc="Artists"):
        with db.context_sess() as sess:
            posts = sess.query(db.ArtItem) \
                .filter(db.ArtItem.artist_id == uid) \
                .order_by(db.ArtItem.addtime) \
                .all()

            dat = []
            for post in tqdm.tqdm(posts, desc="Artist posts"):
                tmp = (
                    uname,
                    post.title,
                    post.content,
                    post.content_structured,
                    list(ptag.tag for ptag in post.tags),
                    list((
                        pfile.item_id,
                        pfile.seqnum,
                        pfile.file_meta,
                        pfile.state,
                        pfile.fspath,
                        pfile.filename,
                    ) for pfile in post.files),
                )
                dat.append(tmp)

            outp = os.path.join(to_path,
                                "%s-%s-%s-dump.json" % (site_name, uname, uid))
            with open(outp, "w") as fp:
                json.dump(dat, fp, indent=4)
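
The per-artist files written above can be loaded back with a small helper. This is only a usage sketch; the filename layout simply mirrors what export_db_contents() writes:

import json
import os

def load_artist_dump(to_path, site_name, uname, uid):
    # Rebuild the "<site>-<artist>-<id>-dump.json" name used by the exporter above.
    outp = os.path.join(to_path, "%s-%s-%s-dump.json" % (site_name, uname, uid))
    with open(outp, "r") as fp:
        return json.load(fp)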
Example #2
def db_misrelink_clean():
	print('Misrelink cleaning')

	with db.context_sess() as sess:

		print("Geting duplicates")
		dupes = get_duplicated_attrs(sess, db.ArtItem, "release_meta")

		print("Duplicates:")
		for dupe_url, in dupes:
			releases = sess.query(db.ArtItem).options(joinedload('artist')) \
				.filter(db.ArtItem.release_meta == dupe_url).all()
			for release in releases:
				check_item(sess, release)

			releases_2 = sess.query(db.ArtItem).options(joinedload('artist')) \
				.filter(db.ArtItem.release_meta == dupe_url).all()
			if len(releases_2) > 1:
				merge_release_set(sess, releases_2)
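
db_misrelink_clean() relies on a get_duplicated_attrs() helper that is not included in these snippets. A minimal sketch of what such a helper might look like in SQLAlchemy; the name and signature come from the call site, everything else is an assumption:

from sqlalchemy import func

def get_duplicated_attrs(sess, model, attr_name):
    # Return one-element tuples of attribute values that occur on more than one row,
    # matching the "for dupe_url, in dupes" unpacking above.
    col = getattr(model, attr_name)
    return sess.query(col) \
        .group_by(col) \
        .having(func.count(col) > 1) \
        .all()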
Example #3
def db_misrelink_clean():
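    """Find ArtItem rows that share the same release_meta value and merge each set of duplicates."""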
    print('Misrelink cleaning')

    with db.context_sess() as sess:

        print("Geting duplicates")
        dupes = get_duplicated_attrs(sess, db.ArtItem, "release_meta")

        print("Duplicates:")
        for dupe_url, in dupes:
            releases = sess.query(db.ArtItem).options(joinedload('artist')) \
                .filter(db.ArtItem.release_meta == dupe_url).all()
            for release in releases:
                check_item(sess, release)

            # Re-query after check_item() so only rows that still share the URL get merged.
            releases_2 = sess.query(db.ArtItem).options(joinedload('artist')) \
                .filter(db.ArtItem.release_meta == dupe_url).all()
            if len(releases_2) > 1:
                merge_release_set(sess, releases_2)
Example #4
def db_name_clean():
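    """Strip whitespace from artist names and merge ScrapeTargets rows that differ only by case or whitespace."""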
    print("db clean")
    with db.context_sess() as sess:
        artists = sess.query(db.ScrapeTargets).all()

        amap = {}
        for artist in tqdm.tqdm(artists):
            akey = (artist.site_name, artist.artist_name.lower().strip())
            if akey in amap:
                print("Duplicate: ", akey)
                print(artist.site_name, artist.artist_name)
                print(amap[akey].site_name, amap[akey].artist_name)

                if artist.artist_name.strip() == artist.artist_name:
                    good = artist
                    bad = amap[akey]
                else:
                    good = amap[akey]
                    bad = artist

                print("Remove: %s -> '%s'" % (bad.site_name, bad.artist_name))
                print("Keep:   %s -> '%s'" %
                      (good.site_name, good.artist_name))

                # Frankly, if a site *isn't* case insensitive, I think they have a design flaw,
                # but it's something I have to account for.
                case_insensitive_sites = [
                    'da', 'wy', 'ib', 'sf', 'hf', 'fa', 'tum'
                ]
                if artist.site_name in case_insensitive_sites:
                    print("Deleting duplicate.")
                    consolidate_artist(sess, bad, good)

                    sess.delete(bad)
                    sess.commit()
            else:
                if artist.artist_name.strip() != artist.artist_name:
                    print("Fixing whitespace: ",
                          (artist.artist_name.strip(), artist.artist_name))
                    artist.artist_name = artist.artist_name.strip()
                    sess.commit()
                amap[akey] = artist
        sess.commit()
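
db_name_clean() also calls a consolidate_artist() helper that is not shown. A minimal sketch of what it might do, assuming ArtItem.artist_id references ScrapeTargets.id as in the other snippets (this is an assumption, not the project's actual implementation):

def consolidate_artist(sess, bad, good):
    # Re-point every item owned by the duplicate ScrapeTargets row at the row being kept,
    # so deleting `bad` afterwards does not orphan its ArtItem rows.
    items = sess.query(db.ArtItem) \
        .filter(db.ArtItem.artist_id == bad.id) \
        .all()
    for item in items:
        item.artist_id = good.id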
Example #5
def dump_item_meta():
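    """Write a .json metadata sidecar next to every downloaded file that still exists on disk."""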
    print("dump_item_meta")
    with db.context_sess() as sess:
        print("Counting")
        tot = sess.query(db.ArtFile).count()

        print("Have %s rows" % (tot))
        res_q = sess.query(db.ArtFile)

        print("Doing query")
        for row in tqdm.tqdm(res_q.yield_per(500), total=tot):

            tags = [tag.tag for tag in row.item.tags]
            artist_name = row.item.artist.artist_name
            site_name = row.item.artist.site_name
            title = row.item.title
            content = row.item.content
            content_structured = row.item.content_structured
            filename = row.fspath
            src_filename = row.filename

            original_file_name = os.path.join(settings['dldCtntPath'],
                                              filename)
            write_to = os.path.join(settings['dldCtntPath'],
                                    filename + ".json")

            if os.path.exists(original_file_name):
                with open(write_to, "w") as fp:
                    fp.write(
                        json.dumps(
                            {
                                "tags": tags,
                                "artist_name": artist_name,
                                "site_name": site_name,
                                "title": title,
                                "content": content,
                                "content_structured": content_structured,
                                "filename": filename,
                                "src_filename": src_filename,
                            },
                            indent=4,
                        ))
Example #6
def db_name_clean():
	print("db clean")
	with db.context_sess() as sess:
		artists = sess.query(db.ScrapeTargets).all()

		amap = {}
		for artist in tqdm.tqdm(artists):
			akey = (artist.site_name, artist.artist_name.lower().strip())
			if akey in amap:
				print("Duplicate: ", akey)
				print(artist.site_name,     artist.artist_name)
				print(amap[akey].site_name, amap[akey].artist_name)

				if artist.artist_name.strip() == artist.artist_name:
					good = artist
					bad = amap[akey]
				else:
					good = amap[akey]
					bad = artist

				print("Remove: %s -> '%s'" % (bad.site_name, bad.artist_name))
				print("Keep:   %s -> '%s'" % (good.site_name, good.artist_name))

				# Frankly, if a site *isn't* case insensitive, I think they have a design flaw,
				# but it's something I have to account for.
				case_insensitive_sites = ['da', 'wy', 'ib', 'sf', 'hf', 'fa', 'tum']
				if artist.site_name in case_insensitive_sites:
					print("Deleting duplicate.")
					consolidate_artist(sess, bad, good)

					sess.delete(bad)
					sess.commit()
			else:
				if artist.artist_name.strip() != artist.artist_name:
					print("Fixing whitespace: ", (artist.artist_name.strip(), artist.artist_name))
					artist.artist_name = artist.artist_name.strip()
					sess.commit()
				amap[akey] = artist
		sess.commit()
Example #7
def reset_last_fetched_times(plugin_name=None):
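    """Force each artist's last-fetched time back to datetime.min for every enabled plugin (or only `plugin_name` if given)."""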

    for key, (plugin_cls, readable_name) in ENABLED_PLUGINS.items():
        if plugin_name and plugin_name != key:
            continue

        instance = plugin_cls()

        with db.context_sess() as sess:
            res = sess.query(db.ScrapeTargets) \
                .filter(db.ScrapeTargets.site_name == key) \
                .all()

            names = [row.artist_name for row in res]
            sess.commit()

        for name in tqdm.tqdm(names):
            instance.update_last_fetched(artist=name,
                                         fetch_time=datetime.datetime.min,
                                         force=True)
        # print("Names:", len(names))

        print(key, plugin_name, readable_name)
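
A hypothetical invocation of the helper above; the plugin keys are whatever ENABLED_PLUGINS is keyed by, and 'da' is only an assumed example based on the site codes appearing elsewhere in these snippets:

reset_last_fetched_times('da')   # reset a single plugin's artists
reset_last_fetched_times()       # reset every enabled plugin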