Example #1
    def consolidate_history_new(self):

        with db.session_context(override_timeout_ms=1000 * 60 * 60 *
                                6) as sess:
            self.qlog.info("Querying for items with significant history size")
            end = sess.execute("""
					SELECT
						min(id), max(id)
					FROM
						web_pages_version
				""")
            start, end = list(end)[0]
            self.qlog.info("Database Extents: %s -> %s", start, end)

            sess.flush()
            sess.expire_all()

        self.url_hit_list = set()

        step = 50000
        start = start - (start % step)
        pbar = tqdm.tqdm(range(start, end, step), position=1)

        # m_tracker = tracker.SummaryTracker()

        delta = 0

        deleted = 0
        for x in pbar:
            # with db.session_context(override_timeout_ms=1000*60*30) as sess:
            with db.session_context(override_timeout_ms=1000 * 60 * 60,
                                    quiet_override=True) as sess:
                pbar.set_description("Deleted %s. Processed %s urls" %
                                     (deleted, len(self.url_hit_list)))
                try:
                    changed = self.truncate_url_range(sess, x, x + step)
                    deleted += changed

                except sqlalchemy.exc.OperationalError:
                    self.log.error("Error in range section %s -> %s", x,
                                   x + step)
                    for line in traceback.format_exc().split("\n"):
                        self.log.error(line)
                    sess.rollback()

                delta += 1
                if delta > 5000:
                    delta = 0
Example #2
def exposed_delete_transactions():
    '''
	Delete every row in the transaction table, stepping through it in 8-hour
	windows and committing after each window.
	'''

    with db.session_context() as sess:

        print("Getting minimum transaction table date..")
        start = sess.execute("""SELECT min(issued_at) FROM transaction""")
        start = unwrap_ret(start)
        print("Minimum transaction time: ", start,
              "getting maximum transaction table date...")
        stop = sess.execute("""SELECT max(issued_at) FROM transaction""")
        stop = unwrap_ret(stop)
        print("Maximum transaction time: ", stop)

        step = datetime.timedelta(hours=8)
        mind = start
        while mind < stop:
            print("Doing delete from %s to %s" % (mind, mind + step))

            have = sess.execute(
                """DELETE FROM transaction WHERE issued_at >= :startd AND issued_at <= :stopd;""",
                {
                    'startd': mind,
                    'stopd': mind + step
                })
            print('Deleted %6i rows. Committing...' % (have.rowcount, ))
            sess.commit()
            print('Committed')

            # print()

            mind += step
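The unwrap_ret() helper used above is not shown in these examples. A minimal sketch of what it presumably does, assuming it just unpacks the single scalar from a one-row, one-column aggregate query:

def unwrap_ret(result):
    # Hypothetical helper: given the result of a query such as
    # "SELECT min(issued_at) FROM transaction", return the lone scalar value.
    rows = list(result)
    assert len(rows) == 1, "Expected exactly one row, got %s" % len(rows)
    return rows[0][0]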
Example #3
def get_high_priority_urls(filter_before=None):
    '''
	Exec time: ~0.2 seconds
	'''
    print("Loading high priority netlocs")
    with db.session_context() as sess:

        query = sess.query(db.WebPages.url)                      \
         .filter(db.WebPages.priority <= db.DB_HIGH_PRIORITY) \
         .filter(db.WebPages.is_text  == True)                \
         .yield_per(10000)

        if filter_before:
            query = query.filter(
                db.NuReleaseItem.release_date >= filter_before)

        page_items = query.all()

        mapdict = {}
        for row, in tqdm.tqdm(page_items):
            itemnl = WebMirror.OutputFilters.util.feedNameLut.patch_blogspot(
                urllib.parse.urlsplit(row).netloc)
            mapdict.setdefault(itemnl, set())
            mapdict[itemnl].add(row)

    print("High Priority outbound items: ", len(mapdict))

    return mapdict
Example #4
def exposed_delete_feed(feed_name, do_delete, search_str):
    '''
	Delete the RSS entries for a feed, using a search key.

	feed_name is the readable name of the feed, from feedNameLut.py.
	do_delete determines whether the deletion is actually performed, or the actions are
		just previewed. Unless do_delete.lower() contains "true", no action will actually
		be taken.
	search_str is the set of substrings to search for. Searches are case sensitive, and
		are applied to the title and tags of each entry.
		search_str is split on the literal character "|", to require that multiple
		substrings all be present.
	'''

    with db.session_context() as sess:
        # Match posts whose parent RssFeedEntry has the requested feed name.
        items = sess.query(db.RssFeedPost)               \
         .filter(db.RssFeedPost.feed_entry.has(db.RssFeedEntry.feed_name == feed_name)) \
         .all()

        do_delete = "true" in do_delete.lower()

        searchitems = search_str.split("|")
        for item in items:
            itemall = " ".join([item.title] + item.tags)
            if all([searchstr in itemall for searchstr in searchitems]):
                print(itemall)
                if do_delete:
                    print("Deleting item")
                    sess.delete(item)

        sess.commit()
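For illustration, calling the command above might look like this; the feed name and search string are made up, and only the second call actually deletes anything:

# Preview which "Example Feed" entries contain both "chapter" and "teaser"
# in their title/tags (no rows are deleted, since do_delete is not "true"):
exposed_delete_feed("Example Feed", "false", "chapter|teaser")

# Delete the matching entries for real:
exposed_delete_feed("Example Feed", "true", "chapter|teaser")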
Example #5
def do_link_batch_update(logger,
                         link_batch,
                         max_pri=None,
                         show_progress=False):
    try:
        for x in range(10):
            try:
                with db.session_context(name='sess-{}'.format(x)) as sess:
                    do_link_batch_update_sess(logger, sess, link_batch)
                return
            except psycopg2.DatabaseError:
                print("Psycopg2 error. Retrying!")
                if x > 3:
                    raise

    except Exception:
        # Loud failure banner so the error stands out in scrolling log output.
        print("ERROR")
        traceback.print_exc()
        print("ERROR")
        raise
Example #6
def exposed_longest_rows():
    '''
	Fetch the rows from the database where the `content` field is longest.
	Return is limited to the biggest 50 rows.
	VERY SLOW (has to scan the entire table)
	'''
    with db.session_context() as sess:
        print("Getting longest rows from database")
        have = sess.execute("""
			SELECT
				id, url, length(content), content
			FROM
				web_pages
			ORDER BY
				LENGTH(content) DESC NULLS LAST
			LIMIT 50;
			""")
        print("Rows:")
        savepath = "./large_files/"
        os.makedirs(savepath, exist_ok=True)
        for row in have:
            print(row[0], row[1])
            with open(os.path.join(savepath, "file %s.txt" % row[0]),
                      "wb") as fp:
                urlst = "URL: %s\n\n" % row[1]
                size = "Length: %s\n\n" % row[2]
                fp.write(urlst.encode("utf-8"))
                fp.write(size.encode("utf-8"))
                fp.write("{}".format(row[3]).encode("utf-8"))
Example #7
def exposed_delete_spcnet_invalid_url_pages():
	'''
	So the spcnet.tv forum software generates THOUSANDS of garbage links somehow.
	Anyways, delete those.
	'''
	with db.session_context() as sess:
		tables = [
			db.WebPages.__table__,
			version_table(db.WebPages.__table__)
		]

		for ctbl in tables:
			# Query for the affected rows.
			q = sess.query(ctbl.c.id) \
				.filter(ctbl.c.netloc == "www.spcnet.tv") \
				.filter(ctbl.c.content.like('%Invalid Forum specified. If you followed a valid link, please notify the%'))
			print("Query:")
			print(q)
			ids = q.all()

			ids = set(ids)

			# Returned list of IDs is each ID packed into a 1-tuple. Unwrap those tuples so it's just a list of integer IDs.
			ids = [tmp[0] for tmp in ids]

			print("Fount %s rows requring deletion. Deleting." % len(ids))
			delete_internal(sess, ids)
			sess.commit()
Example #8
def do_db_sync():
	with db.session_context() as sess:
		res = sess.query(db.RssFeedEntry) \
			.all()
		have_funcs = {row.feed_name : (row.func, row.last_changed) for row in res}
		sess.commit()

		this_dir = os.path.dirname(__file__)
		func_json_path = os.path.join(this_dir, "function_database.json")

	file_funcs = {}
	try:
		if os.path.exists(func_json_path):
			with open(func_json_path, "r") as fp:
				data = fp.read()
				if data:
					file_funcs = json.loads(data)
	except json.JSONDecodeError:
		pass

	if have_funcs == file_funcs:
		print("Function storage file is up-to-date. Nothing to do!")
		return

	print("Updating function database file.")
	def datetime_handler(x):
		if isinstance(x, datetime.datetime):
			return x.isoformat()
		raise TypeError("Unknown type")

	with open(func_json_path, "w") as fp:
		json.dump(have_funcs, fp, indent=True, sort_keys=True, default=datetime_handler)
Example #9
def exposed_astor_roundtrip_parser_functions():
	'''
	Shove the feed-functions through the astor "round-trip"
	facility.

	Mostly, this homogenizes the indentation, and reformats the function.
	'''

	with db.session_context() as sess:
		res = sess.query(db.RssFeedEntry) \
			.all()

		for row in res:
			func = row.get_func()
			_ast = row._get_ast()
			src = astor.to_source(_ast, indent_with="	", pretty_source=better_pretty_source)

			if src.strip() != row.func.strip():
				try:
					rfdb.str_to_function(src, "testing_compile")
					print("Compiled OK")
					row.func = src
				except Exception:
					print("Compilation failed?")
		sess.commit()
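As a minimal, self-contained illustration of the astor round-trip described in the docstring (independent of the database models above), parsing source into an AST and regenerating it normalizes spacing and indentation:

import ast
import astor

messy_src = "def f( x ):\n        return x+1\n"
tree = ast.parse(messy_src)
# astor regenerates canonical source; the output is roughly:
#   def f(x):
#       return x + 1
print(astor.to_source(tree))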
Example #10
    def _getFontLuts(self, fonturls):

        ret = {}

        for key, fonturl_list in fonturls.items():
            ret[key] = []
            for fonturl in fonturl_list:
                self.log.info("Building font remap LUT for font at %s",
                              fonturl)
                with db.session_context() as sess:
                    with WebMirror.API.getPageRow(fonturl,
                                                  ignore_cache=False,
                                                  session=sess) as page:
                        try:
                            page.fetch(ignore_cache=False)
                            _, _, content = page.getResource()
                        except AssertionError:
                            page.fetch(ignore_cache=True)
                            _, _, content = page.getResource()

                        cmap = defont(io.BytesIO(content), fonturl)
                        if cmap:
                            # self.log.info("Font remap contains %s remapped code-points", len(cmap))
                            ret[key].append(cmap)

        # I'm unclear why just this one char is being missed.
        if 'arial-kcds' in ret:
            ret['arial-kcds']["걣"] = "A"

        self.log.info("Found %s remapped fonts", len(ret))

        return ret
Example #11
	def go(self):
		self.log.info("Fetching URLs via local fetcher!")

		for url in self.urls:
			with db.session_context() as sess:
				archiver = SiteArchiver(None, sess, None)
				archiver.synchronousJobRequest(url, ignore_cache=True, debug=True)
Example #12
	def fix_missing_history(self):

		with db.session_context() as sess:
			self.qlog.info("Querying for DB items without any history")
			end = sess.execute("""
				SELECT
					t1.url
				FROM
					web_pages t1
				LEFT JOIN
					web_pages_version t2 ON t2.url = t1.url
				WHERE
					t2.url IS NULL

				""")
			end = [tmp[0] for tmp in end]
			self.log.info("Found %s rows missing history content!", len(end))

			loop = 0
			remaining = len(end)
			for urlset in batch(end, 50):
				self.tickle_rows(sess, urlset)
				sess.expire_all()

				remaining = remaining - len(urlset)
				self.log.info("Processed %s of %s (%s%%)", len(end)-remaining, len(end), 100-((remaining/len(end)) * 100) )

				print("Growth:")
				growth = objgraph.show_growth(limit=10)
				print(growth)
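The batch() helper used here (and in several later examples) is not included in these snippets. A minimal sketch of a compatible implementation, assuming it simply yields successive fixed-size slices of a list:

def batch(iterable, n=1):
    # Yield successive chunks of size n; the callers above pass lists, so
    # len() and slicing are available.
    total = len(iterable)
    for ndx in range(0, total, n):
        yield iterable[ndx:min(ndx + n, total)]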
Example #13
def exposed_delete_nu_unresolved():
    '''
	Delete all nu head system rows that have not been reviewed.

	This is needed for historical purges, particularly if
	nu changes their extnu ids, or if the url masking
	mechanism has significant changes.
	'''
    with db.session_context() as sess:

        count = 0
        print("Loading rows....")
        rows = sess.query(db.NuReleaseItem) \
         .options(joinedload('resolved'))    \
         .all()
        print("Loaded %s rows. Scanning." % len(rows))
        for row in rows:

            if len(list(row.resolved)) == 0 and row.reviewed == 'unverified':

                print(row.id, len(list(row.resolved)), row.referrer)
                for bad in row.resolved:
                    sess.delete(bad)
                sess.delete(row)
                count += 1
                if count % 500 == 0:
                    print("Committing!")
                    sess.commit()

        print("Committing!")
        sess.commit()
Example #14
	def review_probable_validated(self):
		self.log.info("Doing optional validation")
		with db.session_context() as db_sess:
			new_items = db_sess.query(db.NuReleaseItem)           \
					.filter(db.NuReleaseItem.validated == True)        \
					.filter(db.NuReleaseItem.reviewed == 'unverified') \
					.filter(db.NuReleaseItem.actual_target != None)    \
					.order_by(desc(db.NuReleaseItem.first_seen))       \
					.all()


			unverified = db_sess.query(db.NuReleaseItem)           \
					.filter(db.NuReleaseItem.validated == False)        \
					.filter(db.NuReleaseItem.actual_target != None)    \
					.count()

			self.log.info("Have %s items to do validity checks on", len(new_items))
			self.log.info("%s items needing checking", unverified)

			ok = 0
			failed = 0
			for row in new_items:
				did_validate = self.review_probable_validated_row(row)
				if did_validate:
					ok += 1
				else:
					failed += 1

			self.log.info("Row checker reviewd %s successfully, failed %s items", ok, failed)

			db_sess.commit()
Example #15
def exposed_process_qidian_feeds():
    '''

	'''

    with db.session_context() as sess:

        parser = WebMirror.processor.RssProcessor.RssProcessor(
            loggerPath="Main.RssDb",
            pageUrl='http://www.example.org',
            pgContent='',
            type='application/atom+xml',
            transfer=False,
            debug_print=True,
            db_sess=sess,
            write_debug=False)

        print("Getting feed items....")

        feed_item = sess.query(db.RssFeedEntry) \
          .filter(db.RssFeedEntry.feed_name == "Qidian")    \
          .one()

        feed_url = feed_item.urls[0].feed_url
        pfunc = feed_item.get_func()

        missing = []

        for release in feed_item.releases:
            item = {}
            item['title'] = release.title
            item['guid'] = release.contentid
            item['linkUrl'] = release.contenturl

            item['feedUrl'] = feed_url
            item['srcname'] = "wat"
            item['published'] = "wat"

            ret = pfunc(item)
            if not ret:
                missing.append(release.contenturl)

        urls = {}
        for url in missing:
            root, _ = url.rsplit("/", 1)
            urls[root] = url

        wg = common.util.webFunctions.WebGetRobust()

        lines = []
        for root, url in urls.items():
            urlfrag = root.split("www")[-1]
            meta = common.management.util.get_page_title(wg, url)
            title = meta['title']
            outstr = "		('www{}/', '{}', 'translated'),".format(urlfrag, title)
            lines.append(outstr)

        for outstr in lines:
            print(outstr)
Example #16
def test_jt_big_page_flatten():

    print("Trying to flatten huge history")

    giant_history = 'http://japtem.com/fanfic.php'

    proc = DbFlattener()
    with db.session_context() as sess:
        proc.truncate_url_history(sess, giant_history)
Example #17
def exposed_process_nu_pages(transmit=True):
    '''
	Re-process all locally saved novelupdates pages.
	'''

    wg = WebRequest.WebGetRobust()
    with db.session_context() as sess:

        if transmit == True:
            print("Transmitting processed results")
            rm = common.RunManager.Crawler(1, 1)
            message_q = rm.start_aggregator()
        else:
            print("Not translating processed results")
            message_q = queue.Queue()

        pages = []
        print("Beginning DB retreival")
        for row in sess.query(db.WebPages) \
         .filter(db.WebPages.netloc == "www.novelupdates.com") \
         .filter(db.WebPages.url.ilike("%/series/%")) \
         .yield_per(50).all():

            rowtmp = {
                "pageUrl": row.url,
                "pgContent": row.content,
                "type": row.mimetype,
                "wg": wg,
                "message_q": message_q,
            }
            pages.append(rowtmp)

            if len(pages) % 100 == 0:
                print("Loaded %s pages..." % len(pages))
        sess.flush()
        sess.commit()
        for row in pages:
            try:
                # print(row, row.url, row.state)
                if row['pgContent'] and NuSeriesPageFilter.NUSeriesPageFilter.wantsUrl(
                        row['pageUrl']):
                    proc = NuSeriesPageFilter.NUSeriesPageFilter(db_sess=sess,
                                                                 **row)
                    proc.extractContent()
            except Exception:
                print("")
                print("ERROR!")
                for line in traceback.format_exc().split("\n"):
                    print(line.rstrip())
                print("")
            except KeyboardInterrupt:
                break

        runStatus.run_state.value = 0

        if transmit == True:
            rm.join_aggregator()
Example #18
	def truncate_transaction_table(self):
		with db.session_context() as sess:
			self.qlog.info("Deleting items in transaction table")
			sess.execute("""TRUNCATE transaction;""")
			sess.execute("COMMIT;")
			self.qlog.info("Vacuuming table")
			sess.execute("""VACUUM VERBOSE transaction;""")
			sess.execute("COMMIT;")
			self.qlog.info("Table truncated!")
Example #19
    def consolidate_history(self):

        with db.session_context() as sess:
            self.qlog.info("Querying for items with significant history size")
            end = sess.execute("""
					SELECT
						count(*), url
					FROM
						web_pages_version
					GROUP BY
						url
					HAVING
						COUNT(*) > 10
					ORDER BY url

				""")
            end = list(end)
            self.qlog.info(
                "Found %s items with more then 10 history entries. Processing",
                len(end))

            sess.flush()
            sess.expire_all()

        remaining = len(end)
        for batched in batch(end, 50):
            for count, url in batched:

                with db.session_context() as temp_sess:
                    while 1:
                        try:
                            self.truncate_url_history(temp_sess, url)
                            break
                        except sqlalchemy.exc.OperationalError:
                            temp_sess.rollback()

            remaining = remaining - len(batched)
            self.log.info("Processed %s of %s (%s%%)",
                          len(end) - remaining, len(end),
                          100 - ((remaining / len(end)) * 100))

            print("Growth:")
            growth = objgraph.show_growth(limit=10)
            print(growth)
Example #20
def exposed_drop_priorities():
    '''
	Reset the priority of every row in the table to the IDLE_PRIORITY level
	'''

    step = 10000

    with db.session_context() as sess:
        print("Getting minimum row in need or update..")
        start = sess.execute(
            """SELECT min(id) FROM web_pages WHERE priority != 500000""")
        start = list(start)[0][0]
        print("Minimum row ID: ", start, "getting maximum row...")
        stop = sess.execute(
            """SELECT max(id) FROM web_pages WHERE priority != 500000""")
        stop = list(stop)[0][0]
        print("Maximum row ID: ", stop)

        if not start:
            print("No null rows to fix!")
            return

        print("Need to fix rows from %s to %s" % (start, stop))
        start = start - (start % step)

        changed = 0
        for idx in range(start, stop, step):
            try:
                # SQL String munging! I'm a bad person!
                # Only done because I can't easily find how to make sqlalchemy
                # bind parameters ignore the postgres specific cast
                # The id range forces the query planner to use a much smarter approach which is much more performant for small numbers of updates
                have = sess.execute(
                    """update web_pages set priority = 500000 where priority != 500000 AND id > {} AND id <= {};"""
                    .format(idx, idx + step))
                # print()

                processed = idx - start
                total_todo = stop - start
                print('%10i, %10i, %7.4f, %6i' %
                      (idx, stop, processed / total_todo * 100, have.rowcount))
                changed += have.rowcount
                if changed > step:
                    print("Committing (%s changed rows)...." % changed,
                          end=' ')
                    sess.commit()
                    print("done")
                    changed = 0

            except sqlalchemy.exc.OperationalError:
                sess.rollback()
            except sqlalchemy.exc.InvalidRequestError:
                sess.rollback()

        sess.commit()
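For reference, the string-formatted UPDATE above can also be expressed with bound parameters via SQLAlchemy's text() construct. This is a sketch under the same assumed web_pages schema, not the author's code:

from sqlalchemy import text

def drop_priorities_chunk(sess, idx, step, idle_priority=500000):
    # Same ranged UPDATE as above, with bound parameters instead of .format().
    # The id range still keeps the query planner on an index scan.
    result = sess.execute(
        text("""
            UPDATE web_pages
               SET priority = :pri
             WHERE priority != :pri
               AND id > :low
               AND id <= :high
        """),
        {'pri': idle_priority, 'low': idx, 'high': idx + step},
    )
    return result.rowcount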
Example #21
	def timestamp_validated(self):
		self.log.info("Applying a timestamp to all newly validated rows!")
		with db.session_context() as db_sess:
			unstamped = db_sess.query(db.NuReleaseItem)      \
				.filter(db.NuReleaseItem.validated == True) \
				.filter(db.NuReleaseItem.validated_on == None) \
				.all()

			for item in unstamped:
				item.validated_on = datetime.datetime.now()

			db_sess.commit()
Example #22
def check_init_func():
	with db.session_context() as sess:
		raw_cur = sess.connection().connection.cursor()

		cmd = """
			CREATE OR REPLACE FUNCTION upsert_link_raw(
					url_v text,
					starturl_v text,
					netloc_v text,
					distance_v integer,
					priority_v integer,
					addtime_v timestamp without time zone,
					state_v dlstate_enum,
					upsert_epoch_v integer
					)
				RETURNS VOID AS $$

				INSERT INTO
					raw_web_pages
					(url, starturl, netloc, distance, priority, addtime, state, epoch)
				-- 	 (url, starturl, netloc, distance, priority, addtime, state, epoch)
				VALUES
					(     url_v,   starturl_v,   netloc_v,   distance_v,   priority_v,   addtime_v,   state_v, upsert_epoch_v)
				ON CONFLICT (url) DO
					UPDATE
						SET
							state           = EXCLUDED.state,
							starturl        = EXCLUDED.starturl,
							netloc          = EXCLUDED.netloc,
							epoch           = LEAST(EXCLUDED.epoch, raw_web_pages.epoch),
							-- Largest distance is 100, but it's not checked
							distance        = LEAST(EXCLUDED.distance, raw_web_pages.distance),
							-- The lowest priority is 10.
							priority        = LEAST(EXCLUDED.priority, raw_web_pages.priority, 10),
							addtime         = LEAST(EXCLUDED.addtime, raw_web_pages.addtime)
						WHERE
						(
								(raw_web_pages.epoch IS NULL or raw_web_pages.epoch < upsert_epoch_v)
							AND
								raw_web_pages.url = EXCLUDED.url
							AND
								(raw_web_pages.state = 'complete' OR raw_web_pages.state = 'error' OR raw_web_pages.state = 'skipped')
						)
					;

			$$ LANGUAGE SQL;

		"""
		raw_cur.execute(cmd)
		raw_cur.execute("COMMIT;")
Example #23
	def clear_rss_history(self):
		self.log.info("Clearing RSS history")


		with db.session_context(override_timeout_ms=30 * 60 * 1000) as sess:
			for url_set in tqdm.tqdm(list(batch(self.feed_urls, n=5))):
				end = sess.execute("""
					DELETE FROM
						web_pages_version
					WHERE
						url IN :urls
					""", {'urls' : tuple(url_set)})
				self.log.info("Removed %s entries for URLs %s", end.rowcount, url_set )
				sess.commit()
Example #24
	def incremental_consolidate(self, batched):

		for count, url in batched:
			with db.session_context(override_timeout_ms=1000*60*30) as temp_sess:
				while 1:
					try:
						self.truncate_url_history(temp_sess, url)
						break
					except psycopg2.InternalError:
						temp_sess.rollback()
					except sqlalchemy.exc.OperationalError:
						temp_sess.rollback()
					except Exception:
						temp_sess.rollback()
						traceback.print_exc()
Example #25
def exposed_underp_rss_functions():
	'''
	Do stupid fixes to the RSS database.
	'''
	bad = '''buildReleaseMessage('''
	good = '''buildReleaseMessageWithType('''

	with db.session_context() as sess:
		rows = sess.query(db.RssFeedEntry).all()
		for row in rows:
			if bad in row.func:
				row.func = row.func.replace(bad, good)
				print(row)
				print(row.func)
		sess.commit()
Example #26
def exposed_delete_gravitytales_bot_blocked_pages():
    '''
	Delete the "checking you're not a bot" garbage pages
	that sometimes get through the gravitytales scraper.
	'''
    with db.session_context() as sess:
        tables = [db.WebPages.__table__, version_table(db.WebPages.__table__)]

        for ctbl in tables:
            delete_q = ctbl.delete() \
             .where(ctbl.c.netloc == "gravitytales.com") \
             .where(ctbl.c.content.like('%<div id="bot-alert" class="alert alert-info">%'))
            print(delete_q)
            sess.execute(delete_q)
            sess.commit()
Example #27
def exposed_process_new_fb(fetch_title=False):
    '''
	Process items from Creative Novels/Fantasy-Books, and pull out the items
	missing in the feed lookup tool. Then, try to get the series name for each
	unique series ID.
	'''

    # This is probably broken for anyone else
    crn_feed_id = 589

    with db.session_context(override_timeout_ms=1000 * 60 * 15) as sess:

        print("Loading releases from database.")
        feed_q = sess.query(db.RssFeedPost)                                                           \
         .filter(db.RssFeedPost.published > datetime.datetime.now() - datetime.timedelta(days=30)) \
         .filter(db.RssFeedPost.feed_id == crn_feed_id)

        print("Generated query:", feed_q)
        feed_items = feed_q.all()

        print("Loaded %s items. Procesing." % (len(feed_items), ))
        items = proto_process_releases(sess, feed_items)

        print("Items:", len(items))
        out = []

        tags = {}
        for parsed, extracted in items["missed"]:
            if not extracted['tags']:
                print("No tags!")
                continue
            if len(extracted['tags']) != 1:
                print("What:", extracted)
                raise RuntimeError

            key = extracted['tags'][0]
            tags[key] = extracted

        for key, item in tags.items():
            sname, item_type = exposed_crn_series_type_from_chapter_url(
                item['linkUrl'])
            strtmp = "		('%s',                                                                   '%s',                                                                      '%s'), " % (
                key, sname, item_type)
            out.append(strtmp)

        print("Result")
        print()
        print("\n".join(out))
Example #28
    def getMapTable(self, soup):

        cssUrl = self._getFontUrl(soup)
        if not cssUrl:
            return []
        with db.session_context() as sess:
            with WebMirror.API.getPageRow(cssUrl,
                                          ignore_cache=False,
                                          session=sess) as page:
                assert page
                mimetype, fname, content = page.getResource()

        assert mimetype.lower() == "text/css"

        fonturls = self._extractCss(content)
        fontluts = self._getFontLuts(fonturls)
        return fontluts
Example #29
def test():
    print("Test mode!")
    import logSetup
    import settings
    from WebMirror.Engine import SiteArchiver

    logSetup.initLogging()

    urls = [
        'https://royalroadl.com/api/fiction/updates?apiKey=' +
        settings.RRL_API_KEY,
        # 'https://royalroadl.com/api/fiction/newreleases?apiKey=' + settings.RRL_API_KEY,
    ]

    for url in urls:
        with db.session_context() as sess:
            archiver = SiteArchiver(None, sess, None)
            archiver.synchronousJobRequest(url, ignore_cache=True)
Example #30
def exposed_print_scheduled_jobs():
    '''

	'''
    with db.session_context() as sess:

        items = sess.execute("""
			SELECT
				id, next_run_time , job_state
			FROM
				apscheduler_jobs
		""")
        items = list(items)
        for tid, nextcall, content in items:
            print("Job: ", tid.ljust(30), str(nextcall).rjust(20))

            dat = pickle.loads(content)
            pprint.pprint(dat)