def db_fiddle():
    print("Fixing DB things.")
    print("Getting IDs")
    have = db.get_db_session().execute("""
        SELECT id
        FROM web_pages
        WHERE url LIKE 'https://www.wattpad.com/story%'
          AND state != 'new';
        """)
    print("Query executed. Fetching results")
    have = list(have)
    print(len(have))

    count = 0
    chunk = []
    for item, in have:
        chunk.append(item)
        count += 1

        # Flush accumulated IDs back to state 'new' in batches of 1000.
        if count % 1000 == 0:
            statement = db.get_db_session().query(db.WebPages) \
                .filter(db.WebPages.state != 'new')            \
                .filter(db.WebPages.id.in_(chunk))
            # statement = db.get_db_session().update(db.WebPages)
            statement.update({db.WebPages.state: 'new'}, synchronize_session=False)
            chunk = []
            print(count, item)

    # Flush any remaining IDs that did not fill a complete batch.
    if chunk:
        db.get_db_session().query(db.WebPages)     \
            .filter(db.WebPages.state != 'new')    \
            .filter(db.WebPages.id.in_(chunk))     \
            .update({db.WebPages.state: 'new'}, synchronize_session=False)

    db.get_db_session().commit()

def clear_bad():
    from sqlalchemy.dialects import postgresql
    rules = WebMirror.rules.load_rules()

    for ruleset in rules:
        print("Cleaning ruleset")
        # print(ruleset['netlocs'])
        # print(ruleset.keys())
        for badword in ruleset['badwords']:
            if not ruleset['netlocs']:
                continue
            if "%" in badword:
                print(badword)
            else:
                print("Deleting items containing string: '%s'" % badword)
                q = db.get_db_session().query(db.WebPages)                  \
                    .filter(db.WebPages.netloc.in_(ruleset['netlocs']))     \
                    .filter(db.WebPages.url.like("%{}%".format(badword)))
                items = q.count()
                if items:
                    print("%s results for : '%s'" % (items, badword))
                    q = db.get_db_session().query(db.WebPages)                  \
                        .filter(db.WebPages.netloc.in_(ruleset['netlocs']))     \
                        .filter(db.WebPages.url.like("%{}%".format(badword)))   \
                        .delete(synchronize_session=False)
                    db.get_db_session().commit()

def update_feed_names():
    for key, value in feedNameLut.mapper.items():
        feed_items = db.get_db_session().query(db.FeedItems) \
            .filter(db.FeedItems.srcname == key)             \
            .all()
        if feed_items:
            for item in feed_items:
                item.srcname = value
            print(len(feed_items))
            print(key, value)
            db.get_db_session().commit()

def initializeStartUrls(rules):
    print("Initializing all start URLs in the database")
    sess = db.get_db_session()
    for ruleset in [rset for rset in rules if rset['starturls']]:
        for starturl in ruleset['starturls']:
            have = sess.query(db.WebPages)           \
                .filter(db.WebPages.url == starturl) \
                .count()
            if not have:
                netloc = urlFuncs.getNetLoc(starturl)
                new = db.WebPages(
                        url               = starturl,
                        starturl          = starturl,
                        netloc            = netloc,
                        type              = ruleset['type'],
                        priority          = db.DB_IDLE_PRIORITY,
                        distance          = db.DB_DEFAULT_DIST,
                        normal_fetch_mode = ruleset['normal_fetch_mode'],
                    )
                print("Missing start-url for address: '{}'".format(starturl))
                sess.add(new)
                try:
                    sess.commit()
                except sqlalchemy.exc.SQLAlchemyError:
                    print("Failure inserting start url for address: '{}'".format(starturl))
                    sess.rollback()

    sess.close()
    db.delete_db_session()

def longest_rows():
    print("Getting longest rows from database")
    have = db.get_db_session().execute("""
        SELECT
            id, url, length(content), content
        FROM web_pages
        ORDER BY LENGTH(content) DESC NULLS LAST
        LIMIT 50;
        """)
    print("Rows:")

    import os
    import os.path

    savepath = "./large_files/"
    for row in have:
        print(row[0], row[1])
        try:
            os.makedirs(savepath)
        except FileExistsError:
            pass
        with open(os.path.join(savepath, "file %s.txt" % row[0]), "wb") as fp:
            urlst = "URL: %s\n\n" % row[1]
            size  = "Length: %s\n\n" % row[2]
            fp.write(urlst.encode("utf-8"))
            fp.write(size.encode("utf-8"))
            fp.write("{}".format(row[3]).encode("utf-8"))

def delete_comment_feed_items():
    sess = db.get_db_session()
    bad_items = sess.query(db.FeedItems)                                \
        .filter(or_(
            db.FeedItems.contenturl.like("%#comment-%"),
            db.FeedItems.contenturl.like("%CommentsForInMyDaydreams%"),
            db.FeedItems.contenturl.like("%www.fanfiction.net%"),
            db.FeedItems.contenturl.like("%www.fictionpress.com%"),
            db.FeedItems.contenturl.like("%www.booksie.com%")))         \
        .order_by(db.FeedItems.contenturl)                              \
        .all()

    count = 0
    for bad in bad_items:
        print(bad.contenturl)
        # Empty the association lists before deleting the row itself.
        while bad.author:
            bad.author.pop()
        while bad.tags:
            bad.tags.pop()
        sess.delete(bad)
        count += 1
        if count % 1000 == 0:
            print("Committing at %s" % count)
            sess.commit()

    print("Done. Committing...")
    sess.commit()

def doCall(self):
    self.log.info("Calling job %s", self.job_name)
    session = db.get_db_session()
    item = session.query(db.PluginStatus).filter(db.PluginStatus.plugin_name == self.job_name).one()
    if item.is_running:
        session.commit()
        self.log.error("Plugin %s is already running! Not doing re-entrant call!", self.job_name)
        return

    item.is_running = True
    item.last_run = datetime.datetime.now()
    session.commit()

    try:
        self._doCall()
    except Exception:
        item.last_error     = datetime.datetime.now()
        item.last_error_msg = traceback.format_exc()
        raise
    finally:
        item2 = session.query(db.PluginStatus).filter(db.PluginStatus.plugin_name == self.job_name).one()
        item2.is_running = False
        item2.last_run_end = datetime.datetime.now()
        session.commit()
        db.delete_db_session()

    self.log.info("Job %s complete.", self.job_name)

def __init__(self, job_name):
    if job_name not in CALLABLE_LUT:
        raise JobNameException("Callable '%s' is not in the class lookup table: '%s'!" % (job_name, CALLABLE_LUT))
    self.runModule = CALLABLE_LUT[job_name]
    self.job_name = job_name

    session = db.get_db_session()
    try:
        query = session.query(db.PluginStatus).filter(db.PluginStatus.plugin_name == job_name)
        have = query.scalar()
        if not have:
            new = db.PluginStatus(plugin_name=job_name)
            session.add(new)
            session.commit()
    except sqlalchemy.exc.OperationalError:
        session.rollback()
    except sqlalchemy.exc.InvalidRequestError:
        session.rollback()
    finally:
        db.delete_db_session()

def resetRunStates():
    print("JobSetup call resetting run-states!")
    session = db.get_db_session()
    session.query(db.PluginStatus).update({db.PluginStatus.is_running: False})
    session.commit()
    db.delete_db_session()
    print("Run-states reset.")

def test_retrieve(url, debug=True, rss_debug=False):

    # try:
    #     WebMirror.SpecialCase.startAmqpFetcher()
    # except RuntimeError:  # Fetcher already started
    #     pass

    if rss_debug:
        print("Debugging RSS")
        flags.RSS_DEBUG = True

    parsed = urllib.parse.urlparse(url)
    root = urllib.parse.urlunparse((parsed[0], parsed[1], "", "", "", ""))

    new = db.WebPages(
        url       = url,
        starturl  = root,
        netloc    = parsed.netloc,
        distance  = 50000,
        is_text   = True,
        priority  = 500000,
        type      = 'unknown',
        fetchtime = datetime.datetime.now(),
        )

    if debug:
        print(new)

    try:
        archiver = SiteArchiver(None, db.get_db_session(), None)
        job = archiver.synchronousJobRequest(url, ignore_cache=True)
    except Exception:
        traceback.print_exc()
    finally:
        db.delete_db_session()

def __init__(self, connect=True):
    input_settings = {
        'RABBIT_LOGIN'   : settings.NU_RABBIT_LOGIN,
        'RABBIT_PASWD'   : settings.NU_RABBIT_PASWD,
        'RABBIT_SRVER'   : settings.NU_RABBIT_SRVER,
        'RABBIT_VHOST'   : settings.NU_RABBIT_VHOST,
        'synchronous'    : False,
        'prefetch'       : 1,
        'master'         : True,
        'taskq_task'     : 'nuresponse.master.q',
        'taskq_response' : 'nureleases.master.q',
        'poll_rate'      : 1.0 / 25,
    }
    if connect:
        self.data_in = WebMirror.OutputFilters.AmqpInterface.RabbitQueueHandler(input_settings)

    # output_settings = {
    #     'RABBIT_LOGIN'   : settings.RABBIT_LOGIN,
    #     'RABBIT_PASWD'   : settings.RABBIT_PASWD,
    #     'RABBIT_SRVER'   : settings.RABBIT_SRVER,
    #     'RABBIT_VHOST'   : settings.RABBIT_VHOST,
    #     'taskq_task'     : 'task.master.q',
    #     'taskq_response' : 'response.master.q',
    # }

    self.name_lut, self.group_lut = load_lut()

    super().__init__(db_sess=db.get_db_session(postfix='nu_forwarder'), connect=connect)

def fix_null():
    step = 50000

    end = db.get_db_session().execute("""SELECT MAX(id) FROM web_pages WHERE ignoreuntiltime IS NULL;""")
    end = list(end)[0][0]

    start = db.get_db_session().execute("""SELECT MIN(id) FROM web_pages WHERE ignoreuntiltime IS NULL;""")
    start = list(start)[0][0]

    changed = 0
    if not start:
        print("No null rows to fix!")
        return

    start = start - (start % step)
    # Overshoot the end by one step so the final partial window (up to MAX(id)) is processed too.
    for x in range(start + step, end + step, step):
        # SQL string munging! I'm a bad person!
        # Only done because I can't easily find how to make sqlalchemy
        # bind parameters ignore the postgres-specific cast.
        # The id range forces the query planner to use a much smarter approach,
        # which is much more performant for small numbers of updates.
        have = db.get_db_session().execute("""UPDATE web_pages SET ignoreuntiltime = 'epoch'::timestamp WHERE ignoreuntiltime IS NULL AND id < %s AND id >= %s;""" % (x, x - step))
        print('%10i, %7.4f, %6i' % (x, x / end * 100, have.rowcount))
        changed += have.rowcount
        if changed > 10000:
            print("Committing (%s changed rows)...." % changed, end=' ')
            db.get_db_session().commit()
            print("done")
            changed = 0

    db.get_db_session().commit()

def clear_blocked():
    for ruleset in WebMirror.rules.load_rules():
        if ruleset['netlocs'] and ruleset['badwords']:
            # mask = [db.WebPages.url.like("%{}%".format(tmp)) for tmp in ruleset['badwords'] if not "%" in tmp]
            for badword in ruleset['badwords']:
                feed_items = db.get_db_session().query(db.WebPages)             \
                    .filter(db.WebPages.netloc.in_(ruleset['netlocs']))         \
                    .filter(db.WebPages.url.like("%{}%".format(badword)))       \
                    .count()
                if feed_items:
                    print("Have: ", feed_items, badword)
                    feed_items = db.get_db_session().query(db.WebPages)             \
                        .filter(db.WebPages.netloc.in_(ruleset['netlocs']))         \
                        .filter(db.WebPages.url.like("%{}%".format(badword)))       \
                        .delete(synchronize_session=False)
                    # Commit so the bulk delete is actually persisted.
                    db.get_db_session().commit()
                    db.get_db_session().expire_all()
                else:
                    print("Empty: ", feed_items, badword)

def fix_tsv():
    step = 1000

    print("Determining extents that need to be changed.")
    start = db.get_db_session().execute("""SELECT MIN(id) FROM web_pages WHERE tsv_content IS NULL AND content IS NOT NULL AND id != 60903982;""")
    start = list(start)[0][0]
    end = db.get_db_session().execute("""SELECT MAX(id) FROM web_pages WHERE tsv_content IS NULL AND content IS NOT NULL;""")
    end = list(end)[0][0]

    changed = 0
    print("Start: ", start)
    print("End:   ", end)

    if not start:
        print("No null rows to fix!")
        return

    start = start - (start % step)
    # Overshoot the end by one step so the final partial window (up to MAX(id)) is processed too.
    for x in range(start + step, end + step, step):
        try:
            # SQL string munging! I'm a bad person!
            # Only done because I can't easily find how to make sqlalchemy
            # bind parameters ignore the postgres-specific cast.
            # The id range forces the query planner to use a much smarter approach,
            # which is much more performant for small numbers of updates.
            have = db.get_db_session().execute("""UPDATE web_pages SET tsv_content = to_tsvector(coalesce(content)) WHERE tsv_content IS NULL AND content IS NOT NULL AND id < %s AND id >= %s;""" % (x, x - step))
            print('%10i, %10i, %7.4f, %6i' % (x, end, (x - start) / (end - start) * 100, have.rowcount))
            changed += have.rowcount
            if changed > step:
                print("Committing (%s changed rows)...." % changed, end=' ')
                db.get_db_session().commit()
                print("done")
                changed = 0
        except sqlalchemy.exc.OperationalError:
            db.get_db_session().rollback()
            print("Error!")
            traceback.print_exc()

    db.get_db_session().commit()

def resetInProgress():
    print("Resetting any stalled downloads from the previous session.")
    sess = db.get_db_session()
    sess.query(db.WebPages)                               \
        .filter(
            (db.WebPages.state == "fetching")             |
            (db.WebPages.state == "processing")           |
            (db.WebPages.state == "specialty_deferred")   |
            (db.WebPages.state == "specialty_ready")
            )                                             \
        .update({db.WebPages.state: "new"})
    sess.commit()
    sess.close()
    db.delete_db_session()

def get_times(self):
    conn = database.get_db_session()
    aps = conn.execute("SELECT job_state FROM apscheduler_jobs;")

    update_times = []
    for blob, in aps:
        job_dict = pickle.loads(blob)
        update_times.append((job_dict['id'], job_dict['next_run_time'].isoformat()))

    data = {
        "update-times" : update_times,
    }
    database.delete_db_session()
    return pack_message("system-update-times", data)

def clean_files(self):
    session = db.get_db_session()
    q = session.query(db.WebFiles) \
        .filter(db.WebFiles.fspath != None)
    self.log.info("Querying for non-null filepaths...")
    have = q.all()
    self.log.info("Have %s local files.", len(have))

    count = 0
    for file in have:
        fpath = os.path.join(settings.RESOURCE_DIR, file.fspath)
        if not os.path.exists(fpath):
            self.log.error("Missing file: %s", fpath)
        count += 1
        if count % 1000 == 0:
            self.log.info("Scanned %s files.", count)

def consolidate_history():
    sess = db.get_db_session()
    print("Doing select")
    end = sess.execute("""
        SELECT
            count(*), url
        FROM
            web_pages_version
        GROUP BY
            url
        HAVING COUNT(*) > 1
        LIMIT 40
        """)
    print("Query executed. Results:")
    for row in end:
        print(row)

def do_task(self):
    db_handle = db.get_db_session()
    hadjob = False
    try:
        self.archiver = WebMirror.Engine.SiteArchiver(
                self.cookie_lock,
                new_job_queue  = self.new_job_queue,
                response_queue = self.resp_queue,
                db_interface   = db_handle,
            )
        hadjob = self.archiver.taskProcess()
    finally:
        # Clear out the sqlalchemy state
        db_handle.expunge_all()
        db.delete_db_session()

    return hadjob

def delete_feed(feed_name, do_delete, search_str):
    sess = db.get_db_session()
    items = sess.query(db.FeedItems)                \
        .filter(db.FeedItems.srcname == feed_name)  \
        .all()

    do_delete = "true" in do_delete.lower()

    searchitems = search_str.split("|")

    for item in items:
        itemall = " ".join([item.title] + item.tags)
        if all([searchstr in itemall for searchstr in searchitems]):
            print(itemall)
            if do_delete:
                print("Deleting item")
                sess.delete(item)

    sess.commit()

def load_items():
    feed_items = db.get_db_session().query(db.FeedItems) \
        .order_by(db.FeedItems.srcname)                  \
        .order_by(db.FeedItems.title)                    \
        .all()

    with open("tests/title_test_data_two.py", 'w') as fp:
        fp.write("data = [\n")
        for row in feed_items:
            title = row.title
            try:
                p = TP(title)
                fp.write(format_row(title, p.getVolume(), p.getChapter(), p.getFragment(), p.getPostfix()))
            except ValueError:
                fp.write(format_row(title, 0, 0, 0, ''))
        fp.write("]\n")

def rss_db_sync(target=None, days=False, silent=False):
    json_file = 'rss_filter_misses-1.json'
    write_debug = True
    if silent:
        config.C_DO_RABBIT = False
    if target:
        config.C_DO_RABBIT = False
        flags.RSS_DEBUG = True
        write_debug = False
    else:
        try:
            os.unlink(json_file)
        except FileNotFoundError:
            pass

    import WebMirror.processor.RssProcessor
    parser = WebMirror.processor.RssProcessor.RssProcessor(
            loggerPath  = "Main.RssDb",
            pageUrl     = 'http://www.example.org',
            pgContent   = '',
            type        = 'application/atom+xml',
            transfer    = False,
            debug_print = True,
            db_sess     = None,
            write_debug = write_debug,
        )

    print("Getting feed items....")

    if target:
        print("Limiting to '%s' source." % target)
        feed_items = db.get_db_session().query(db.FeedItems) \
            .filter(db.FeedItems.srcname == target)          \
            .order_by(db.FeedItems.srcname)                  \
            .order_by(db.FeedItems.title)                    \
            .all()
    elif days:
        print("RSS age override: ", days)
        cutoff = datetime.datetime.now() - datetime.timedelta(days=days)
        feed_items = db.get_db_session().query(db.FeedItems) \
            .filter(db.FeedItems.published > cutoff)         \
            .order_by(db.FeedItems.srcname)                  \
            .order_by(db.FeedItems.title)                    \
            .all()
    else:
        feed_items = db.get_db_session().query(db.FeedItems) \
            .order_by(db.FeedItems.srcname)                  \
            .order_by(db.FeedItems.title)                    \
            .all()

    print("Feed items: ", len(feed_items))

    for item in feed_items:
        ctnt = {}
        ctnt['srcname']   = item.srcname
        ctnt['title']     = item.title
        ctnt['tags']      = item.tags
        ctnt['linkUrl']   = item.contenturl
        ctnt['guid']      = item.contentid
        ctnt['published'] = calendar.timegm(item.published.timetuple())

        # Pop()ed off in processFeedData().
        ctnt['contents']  = 'wat'

        try:
            parser.processFeedData(ctnt, tx_raw=False, tx_parse=not bool(days))
        except ValueError:
            pass
        # print(ctnt)

    if target is None:
        sort_json(json_file)

def launch_agg(cls, agg_queue):
    install_pystuck()
    agg_db = db.get_db_session()
    instance = cls(agg_queue, agg_db)
    instance.run()
    instance.close()

def purge_invalid_urls(selected_netloc=None):
    sess = db.get_db_session()
    for ruleset in WebMirror.rules.load_rules():
        if (
                (ruleset['netlocs'] and ruleset['badwords'])
                and
                (
                    (ruleset['netlocs'] and ruleset['badwords'] and selected_netloc is None)
                    or
                    (selected_netloc is not None and selected_netloc in ruleset['netlocs'])
                )
            ):

            # We have to delete from the normal table before the versioning table,
            # because deleting from the normal table causes inserts into the versioning table
            # due to the change tracking triggers.
            count = 1

            # Rows matching one of the ruleset's netlocs AND containing one of its badwords.
            loc = and_(
                    db.WebPages.netloc.in_(ruleset['netlocs']),
                    or_(*(db.WebPages.url.like("%{}%".format(badword)) for badword in ruleset['badwords']))
                )

            # print("Doing count on table ")
            # count = sess.query(db.WebPages) \
            #     .filter(loc)                \
            #     .count()

            if count == 0:
                print("{num} items match badwords from file {file}. No deletion required ".format(file=ruleset['filename'], num=count))
            else:
                print("{num} items match badwords from file {file}. Deleting ".format(file=ruleset['filename'], num=count))
                sess.query(db.WebPages)  \
                    .filter(loc)         \
                    .delete(synchronize_session=False)

            # Do the delete from the versioning table now.
            ctbl = version_table(db.WebPages)
            loc2 = and_(
                    ctbl.c.netloc.in_(ruleset['netlocs']),
                    or_(*(ctbl.c.url.like("%{}%".format(badword)) for badword in ruleset['badwords']))
                )

            # print("Doing count on Versioning table ")
            # count = sess.query(ctbl) \
            #     .filter(loc2)        \
            #     .count()

            if count == 0:
                print("{num} items in versioning table match badwords from file {file}. No deletion required ".format(file=ruleset['filename'], num=count))
            else:
                print("{num} items in versioning table match badwords from file {file}. Deleting ".format(file=ruleset['filename'], num=count))
                sess.query(ctbl)     \
                    .filter(loc2)    \
                    .delete(synchronize_session=False)

    sess.commit()

def disable_wattpad():
    step = 50000

    print("Determining extents that need to be changed.")
    start = db.get_db_session().execute("""
        SELECT MIN(id) FROM web_pages WHERE (netloc = 'www.wattpad.com' OR netloc = 'a.wattpad.com');""")
    start = list(start)[0][0]
    print("Start: ", start)

    end = db.get_db_session().execute("""
        SELECT MAX(id) FROM web_pages WHERE (netloc = 'www.wattpad.com' OR netloc = 'a.wattpad.com');""")
    end = list(end)[0][0]
    changed = 0
    print("End:   ", end)

    if not start:
        print("No rows to fix!")
        return

    start = start - (start % step)
    # Overshoot the end by one step so the final partial window (up to MAX(id)) is processed too.
    for x in range(start + step, end + step, step):
        try:
            # SQL string munging! I'm a bad person!
            # Only done because I can't easily find how to make sqlalchemy
            # bind parameters ignore the postgres-specific cast.
            # The id range forces the query planner to use a much smarter approach,
            # which is much more performant for small numbers of updates.
            have = db.get_db_session().execute("""
                UPDATE
                    web_pages
                SET
                    state = 'disabled'
                WHERE
                    (netloc = 'www.wattpad.com' OR netloc = 'a.wattpad.com')
                AND
                    (state = 'new' OR state = 'fetching' OR state = 'processing')
                AND
                    id < %s AND id >= %s;""" % (x, x - step))
            print('%10i, %10i, %10i, %7.4f, %6i' % (start, x, end, (x - start) / (end - start) * 100, have.rowcount))
            changed += have.rowcount
            if changed > step / 2:
                print("Committing (%s changed rows)...." % changed, end=' ')
                db.get_db_session().commit()
                print("done")
                changed = 0
        except sqlalchemy.exc.OperationalError:
            db.get_db_session().rollback()
            print("Error!")
            traceback.print_exc()

    db.get_db_session().commit()