def db_fiddle():
    print("Fixing DB things.")
    print("Getting IDs")
    have = db.get_session().execute("""
        SELECT id
        FROM web_pages
        WHERE
            url LIKE 'https://www.wattpad.com/story%'
        AND
            state != 'new';
        """)
    print("Query executed. Fetching results")
    have = list(have)
    print(len(have))

    count = 0
    chunk = []
    for item, in have:
        chunk.append(item)
        count += 1
        if count % 1000 == 0:
            statement = db.get_session().query(db.WebPages) \
                .filter(db.WebPages.state != 'new')         \
                .filter(db.WebPages.id.in_(chunk))
            # statement = db.get_session().update(db.WebPages)
            statement.update({db.WebPages.state : 'new'}, synchronize_session=False)
            chunk = []
            print(count, item)
            db.get_session().commit()

    # Flush any leftover IDs that didn't fill a complete 1000-item chunk.
    if chunk:
        db.get_session().query(db.WebPages)     \
            .filter(db.WebPages.state != 'new') \
            .filter(db.WebPages.id.in_(chunk))  \
            .update({db.WebPages.state : 'new'}, synchronize_session=False)
        db.get_session().commit()
def do_link_batch_update(self):
    if not self.batched_links:
        return

    self.log.info("Inserting %s items into DB in batch.", len(self.batched_links))
    while 1:
        try:
            cmd = text("""
                    INSERT INTO
                        web_pages
                        (url, starturl, netloc, distance, is_text, priority, type, fetchtime, state)
                    VALUES
                        (:url, :starturl, :netloc, :distance, :is_text, :priority, :type, :fetchtime, :state)
                    ON CONFLICT DO NOTHING
                    """)
            for paramset in self.batched_links:
                db.get_session().execute(cmd, params=paramset)

            db.get_session().commit()
            self.batched_links = []
            break

        except KeyboardInterrupt:
            self.log.info("Keyboard Interrupt?")
            db.get_session().rollback()
        except sqlalchemy.exc.InternalError:
            self.log.info("Transaction error. Retrying.")
            traceback.print_exc()
            db.get_session().rollback()
        except sqlalchemy.exc.OperationalError:
            self.log.info("Transaction error. Retrying.")
            traceback.print_exc()
            db.get_session().rollback()
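# Hedged alternative sketch, not part of the original do_link_batch_update():
# SQLAlchemy's Session.execute() also accepts a *list* of parameter dicts, which
# issues the INSERT in executemany form instead of one round trip per row. This
# assumes self.batched_links is a list of dicts whose keys match the bind names
# above; the method name do_link_batch_update_sketch is hypothetical.
from sqlalchemy import text

def do_link_batch_update_sketch(self):
    if not self.batched_links:
        return

    cmd = text("""
            INSERT INTO
                web_pages
                (url, starturl, netloc, distance, is_text, priority, type, fetchtime, state)
            VALUES
                (:url, :starturl, :netloc, :distance, :is_text, :priority, :type, :fetchtime, :state)
            ON CONFLICT DO NOTHING
            """)

    # One execute() call with all parameter sets -> executemany on the DBAPI side.
    db.get_session().execute(cmd, self.batched_links)
    db.get_session().commit()
    self.batched_links = []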
def clear_blocked():
    for ruleset in WebMirror.rules.load_rules():
        if ruleset['netlocs'] and ruleset['badwords']:
            # mask = [db.WebPages.url.like("%{}%".format(tmp)) for tmp in ruleset['badwords'] if not "%" in tmp]
            for badword in ruleset['badwords']:
                feed_items = db.get_session().query(db.WebPages)              \
                    .filter(db.WebPages.netloc.in_(ruleset['netlocs']))       \
                    .filter(db.WebPages.url.like("%{}%".format(badword)))     \
                    .count()
                if feed_items:
                    print("Have: ", feed_items, badword)
                    feed_items = db.get_session().query(db.WebPages)          \
                        .filter(db.WebPages.netloc.in_(ruleset['netlocs']))   \
                        .filter(db.WebPages.url.like("%{}%".format(badword))) \
                        .delete(synchronize_session=False)
                    db.get_session().expire_all()
                else:
                    print("Empty: ", feed_items, badword)

            # print(mask)
            # print(ruleset['netlocs'])
            # print(ruleset['badwords'])
            pass
def clear_bad():
    from sqlalchemy.dialects import postgresql
    rules = WebMirror.rules.load_rules()
    for ruleset in rules:
        # print(ruleset['netlocs'])
        # print(ruleset.keys())
        for badword in ruleset['badwords']:
            if not ruleset['netlocs']:
                continue
            if "%" in badword:
                print(badword)
            else:
                q = db.get_session().query(db.WebPages)                       \
                    .filter(db.WebPages.netloc.in_(ruleset['netlocs']))       \
                    .filter(db.WebPages.url.like("%{}%".format(badword)))
                items = q.count()
                if items:
                    print("%s results for : '%s'" % (items, badword))
                    q = db.get_session().query(db.WebPages)                   \
                        .filter(db.WebPages.netloc.in_(ruleset['netlocs']))   \
                        .filter(db.WebPages.url.like("%{}%".format(badword))) \
                        .delete(synchronize_session=False)
                    db.get_session().commit()
def resetInProgress():
    print("Resetting any stalled downloads from the previous session.")
    # db.get_session().begin()
    db.get_session().query(db.WebPages)                                                  \
        .filter((db.WebPages.state == "fetching") | (db.WebPages.state == "processing")) \
        .update({db.WebPages.state : "new"})
    db.get_session().commit()
def update_feed_names():
    for key, value in feedNameLut.mapper.items():
        feed_items = db.get_session().query(db.FeedItems) \
            .filter(db.FeedItems.srcname == key)          \
            .all()
        if feed_items:
            for item in feed_items:
                item.srcname = value
            print(len(feed_items))
            print(key, value)
            db.get_session().commit()
def doCall(self):
    session = db.get_session()
    item = session.query(db.PluginStatus).filter(db.PluginStatus.plugin_name == self.job_name).one()
    if item.is_running:
        session.commit()
        self.log.error("Plugin %s is already running! Not doing re-entrant call!", self.job_name)
        return

    item.is_running = True
    item.last_run = datetime.datetime.now()
    session.commit()

    try:
        self._doCall()
    except Exception:
        item.last_error     = datetime.datetime.now()
        item.last_error_msg = traceback.format_exc()
        raise
    finally:
        item.is_running   = False
        item.last_run_end = datetime.datetime.now()
        session.commit()
        db.delete_session()
def delete_comment_feed_items():
    sess = db.get_session()
    bad_items = sess.query(db.FeedItems)                                \
        .filter(or_(
            db.FeedItems.contenturl.like("%#comment-%"),
            db.FeedItems.contenturl.like("%CommentsForInMyDaydreams%"),
            db.FeedItems.contenturl.like("%www.fanfiction.net%"),
            db.FeedItems.contenturl.like("%www.fictionpress.com%"),
            db.FeedItems.contenturl.like("%www.booksie.com%")))         \
        .order_by(db.FeedItems.contenturl)                              \
        .all()

    count = 0
    for bad in bad_items:
        print(bad.contenturl)
        while bad.author:
            bad.author.pop()
        while bad.tags:
            bad.tags.pop()
        sess.delete(bad)
        count += 1
        if count % 1000 == 0:
            print("Committing at %s" % count)
            sess.commit()

    print("Done. Committing...")
    sess.commit()
def longest_rows():
    print("Getting longest rows from database")
    have = db.get_session().execute("""
        SELECT
            id, url, length(content), content
        FROM web_pages
        ORDER BY LENGTH(content) DESC NULLS LAST
        LIMIT 50;
        """)
    print("Rows:")

    import os
    import os.path
    savepath = "./large_files/"

    for row in have:
        print(row[0], row[1])
        try:
            os.makedirs(savepath)
        except FileExistsError:
            pass
        with open(os.path.join(savepath, "file %s.txt" % row[0]), "wb") as fp:
            urlst = "URL: %s\n\n" % row[1]
            size  = "Length: %s\n\n" % row[2]
            fp.write(urlst.encode("utf-8"))
            fp.write(size.encode("utf-8"))
            fp.write("{}".format(row[3]).encode("utf-8"))
def fix_null():
    step = 50000
    end = db.get_session().execute("""SELECT MAX(id) FROM web_pages WHERE ignoreuntiltime IS NULL;""")
    end = list(end)[0][0]
    start = db.get_session().execute("""SELECT MIN(id) FROM web_pages WHERE ignoreuntiltime IS NULL;""")
    start = list(start)[0][0]

    changed = 0
    if not start:
        print("No null rows to fix!")
        return
    start = start - (start % step)

    # Range upper bound is end + step + 1 so the final partial block (including
    # the row with id == end) is covered by the id window below.
    for x in range(start, end + step + 1, step):
        # SQL String munging! I'm a bad person!
        # Only done because I can't easily find how to make sqlalchemy
        # bind parameters ignore the postgres-specific cast.
        # The id range forces the query planner to use a much smarter approach,
        # which is much more performant for small numbers of updates.
        have = db.get_session().execute("""UPDATE
                                                web_pages
                                            SET
                                                ignoreuntiltime = 'epoch'::timestamp
                                            WHERE
                                                ignoreuntiltime IS NULL
                                            AND
                                                id < %s
                                            AND
                                                id >= %s;""" % (x, x - step))
        # print()
        print('%10i, %7.4f, %6i' % (x, x / end * 100, have.rowcount))
        changed += have.rowcount
        if changed > 10000:
            print("Committing (%s changed rows)...." % changed, end=' ')
            db.get_session().commit()
            print("done")
            changed = 0

    db.get_session().commit()
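# Hedged sketch, not part of the original: fix_null() (and fix_tsv() below) build
# the UPDATE via string interpolation because of trouble getting bind parameters
# to coexist with the postgres-specific '::timestamp' cast. Using the SQL-standard
# CAST('epoch' AS timestamp) form avoids the '::' syntax entirely, so ordinary
# bind parameters can be used. The helper name fix_null_chunk is hypothetical.
from sqlalchemy import text

def fix_null_chunk(lo, hi):
    # Update one id window [lo, hi) using bound parameters instead of % formatting.
    return db.get_session().execute(
        text("""UPDATE web_pages
                SET    ignoreuntiltime = CAST('epoch' AS timestamp)
                WHERE  ignoreuntiltime IS NULL
                AND    id <  :hi
                AND    id >= :lo;"""),
        {'lo': lo, 'hi': hi})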
def getDistinctNetlocs():
    query = database.get_session().query(database.WebPages)   \
        .filter(database.WebPages.is_text == True)            \
        .filter(database.WebPages.file == None)               \
        .filter(database.WebPages.state == 'complete')        \
        .distinct(database.WebPages.netloc)                   \
        .with_entities(database.WebPages.id, database.WebPages.netloc)
    # print(query)
    vals = query.all()
    return vals
def handleRateLimiting(params, job, engine):
    allowable = params[0]
    with FETCH_LOCK:
        if not job.netloc in ACTIVE_FETCHES:
            ACTIVE_FETCHES[job.netloc] = 0
        log.info("Active fetchers for domain %s - %s", job.netloc, ACTIVE_FETCHES[job.netloc])

    if ACTIVE_FETCHES[job.netloc] > allowable:
        log.info("Too many instances of fetchers for domain %s active. Forcing requests to sleep for a while", job.netloc)
        job.ignoreuntiltime = datetime.datetime.now() + datetime.timedelta(seconds=60 * 5 + random.randrange(0, 60 * 5))
        db.get_session().commit()
        return
    else:
        with FETCH_LOCK:
            ACTIVE_FETCHES[job.netloc] += 1
        engine.do_job(job)
        time.sleep(5)
        with FETCH_LOCK:
            ACTIVE_FETCHES[job.netloc] -= 1
def status_view():
    session = db.get_session()
    # session.expire()
    tasks = get_scheduled_tasks(session)
    states = session.query(db.PluginStatus).all()
    session.commit()
    return render_template('status.html',
            tasks  = tasks,
            states = states,
        )
def initializeStartUrls(rules):
    print("Initializing all start URLs in the database")
    for ruleset in [rset for rset in rules if rset['starturls']]:
        for starturl in ruleset['starturls']:
            have = db.get_session().query(db.WebPages) \
                .filter(db.WebPages.url == starturl)   \
                .count()
            if not have:
                netloc = urlFuncs.getNetLoc(starturl)
                new = db.WebPages(
                        url               = starturl,
                        starturl          = starturl,
                        netloc            = netloc,
                        type              = ruleset['type'],
                        priority          = db.DB_IDLE_PRIORITY,
                        distance          = db.DB_DEFAULT_DIST,
                        normal_fetch_mode = ruleset['normal_fetch_mode'],
                    )
                print("Missing start-url for address: '{}'".format(starturl))
                db.get_session().add(new)
                db.get_session().commit()
def fix_tsv():
    step = 1000
    print("Determining extents that need to be changed.")
    start = db.get_session().execute("""SELECT MIN(id) FROM web_pages WHERE tsv_content IS NULL AND content IS NOT NULL AND id != 60903982;""")
    start = list(start)[0][0]
    end = db.get_session().execute("""SELECT MAX(id) FROM web_pages WHERE tsv_content IS NULL AND content IS NOT NULL;""")
    end = list(end)[0][0]

    changed = 0
    print("Start: ", start)
    print("End:   ", end)

    if not start:
        print("No null rows to fix!")
        return
    start = start - (start % step)

    # Range upper bound is end + step + 1 so the final partial block (including
    # the row with id == end) is covered by the id window below.
    for x in range(start, end + step + 1, step):
        try:
            # SQL String munging! I'm a bad person!
            # Only done because I can't easily find how to make sqlalchemy
            # bind parameters ignore the postgres-specific cast.
            # The id range forces the query planner to use a much smarter approach,
            # which is much more performant for small numbers of updates.
            have = db.get_session().execute("""UPDATE
                                                    web_pages
                                                SET
                                                    tsv_content = to_tsvector(coalesce(content))
                                                WHERE
                                                    tsv_content IS NULL
                                                AND
                                                    content IS NOT NULL
                                                AND
                                                    id < %s
                                                AND
                                                    id >= %s;""" % (x, x - step))
            # print()
            print('%10i, %10i, %7.4f, %6i' % (x, end, (x - start) / (end - start) * 100, have.rowcount))
            changed += have.rowcount
            if changed > step:
                print("Committing (%s changed rows)...." % changed, end=' ')
                db.get_session().commit()
                print("done")
                changed = 0

        except sqlalchemy.exc.OperationalError:
            db.get_session().rollback()
            print("Error!")
            traceback.print_exc()

    db.get_session().commit()
def renderFeedEntry(postid):
    post = db.get_session().query(db.FeedItems) \
        .filter(db.FeedItems.id == postid)      \
        .scalar()

    # relink the feed contents.
    contents = WebMirror.API.replace_links(post.contents)

    return render_template('rss-pages/post.html',
            entry    = post,
            contents = contents
        )
def renderFeedsSourceTable(source, page=1):
    feeds = db.get_session().query(db.FeedItems) \
        .filter(db.FeedItems.srcname == source)  \
        .order_by(desc(db.FeedItems.published))

    if feeds is None:
        flash(gettext('No feeds? Something is /probably/ broken!'))
        return redirect(url_for('renderFeedsTable'))

    feed_entries = paginate(feeds, page, app.config['FEED_ITEMS_PER_PAGE'])

    return render_template('rss-pages/feeds.html',
            subheader     = "Source = '%s'" % source,
            sequence_item = feed_entries,
            page          = page
        )
def fetch_content(query_text, column, page, sources=None):
    session = db.get_session()
    tsq = build_tsquery(query_text)

    if column == db.WebPages.title:
        query = session                                                                                     \
            .query(db.WebPages, func.ts_rank_cd(func.to_tsvector("english", column), func.to_tsquery(tsq))) \
            .filter(
                func.to_tsvector("english", column).match(tsq, postgresql_regconfig='english')
            )                                                                                               \
            .order_by(func.ts_rank_cd(func.to_tsvector("english", column), func.to_tsquery(tsq)).desc())
    elif column == db.WebPages.tsv_content:
        query = session                                                        \
            .query(db.WebPages, func.ts_rank_cd(column, func.to_tsquery(tsq))) \
            .filter(column.match(tsq))                                         \
            .order_by(func.ts_rank_cd(column, func.to_tsquery(tsq)).desc())

        if sources:
            query = query.filter(db.WebPages.netloc.in_(sources))
    else:
        raise ValueError("Wat?")

    # print(str(query.statement.compile(dialect=postgresql.dialect())))
    # print("param: '%s'" % tsq)

    try:
        entries = paginate(query, page, per_page=50)
    except sqlalchemy.exc.ProgrammingError:
        traceback.print_exc()
        print("ProgrammingError - Rolling back!")
        db.get_session().rollback()
        raise
    except sqlalchemy.exc.InternalError:
        traceback.print_exc()
        print("InternalError - Rolling back!")
        db.get_session().rollback()
        raise
    except sqlalchemy.exc.OperationalError:
        traceback.print_exc()
        print("OperationalError - Rolling back!")
        db.get_session().rollback()
        raise

    return entries
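# Hedged sketch, not part of the original: fetch_content() relies on a
# build_tsquery() helper whose implementation is not shown in this section.
# A minimal illustrative version (an assumption, not the project's actual
# implementation) might simply AND together the whitespace-separated terms
# in the form understood by PostgreSQL's to_tsquery():
def build_tsquery_sketch(query_text):
    # "foo bar" -> "foo & bar"
    terms = [term for term in query_text.split() if term]
    return " & ".join(terms)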
def getByNetlocPrefix(netloc, prefix):
    # print("Netloc prefixes")
    query = database.get_session().query(database.WebPages)         \
        .filter(database.WebPages.is_text == True)                  \
        .filter(database.WebPages.title.like("{}%".format(prefix))) \
        .filter(database.WebPages.netloc == netloc)                 \
        .order_by(database.WebPages.title)                          \
        .with_entities(database.WebPages.id, database.WebPages.url, database.WebPages.title)
    # .filter(database.WebPages.file == None) \
    # .filter(database.WebPages.state == 'complete') \
    # .group_by(substr_chunk) \

    vals = query.all()
    return vals
def clean_files(self):
    session = db.get_session()
    q = session.query(db.WebFiles) \
        .filter(db.WebFiles.fspath != None)
    self.log.info("Querying for non-null filepaths...")
    have = q.all()
    self.log.info("Have %s local files.", len(have))

    count = 0
    for file in have:
        fpath = os.path.join(settings.RESOURCE_DIR, file.fspath)
        if not os.path.exists(fpath):
            self.log.error("Missing file: %s", fpath)
        count += 1
        if count % 1000 == 0:
            self.log.info("Scanned %s files.", count)
def renderFeedsTagTable(tag, page=1):
    query = db.get_session().query(db.FeedItems)
    # query = query.join(db.Tags)
    query = query.filter(db.FeedItems.tags.contains(tag))
    query = query.order_by(desc(db.FeedItems.published))
    feeds = query

    if feeds is None:
        flash(gettext('No feeds? Something is /probably/ broken!'))
        return redirect(url_for('renderFeedsTable'))

    feed_entries = paginate(feeds, page, app.config['FEED_ITEMS_PER_PAGE'])

    return render_template('rss-pages/feeds.html',
            subheader     = "Tag = '%s'" % tag,
            sequence_item = feed_entries,
            page          = page
        )
def scheduleJobs(sched, timeToStart):
    jobs = []
    offset = 0
    for key, value in activePlugins.scrapePlugins.items():
        baseModule, interval = value
        jobs.append((key, baseModule, interval, timeToStart + datetime.timedelta(seconds=60 * offset)))
        offset += 1

    activeJobs = []
    print("JobCaller: ", JobCaller)
    print("JobCaller.callMod: ", JobCaller.callMod)

    for jobId, callee, interval, startWhen in jobs:
        jId = callee.__name__
        print("JobID = ", jId)
        activeJobs.append(jId)
        if not sched.get_job(jId):
            sched.add_job(do_call,
                    args               = (callee.__name__, ),
                    trigger            = 'interval',
                    seconds            = interval,
                    start_date         = startWhen,
                    id                 = jId,
                    max_instances      = 1,
                    replace_existing   = True,
                    jobstore           = "main_jobstore",
                    misfire_grace_time = 2**30)

    for job in sched.get_jobs('main_jobstore'):
        if not job.id in activeJobs:
            sched.remove_job(job.id, 'main_jobstore')

    print("JobSetup call resetting run-states!")
    session = db.get_session()
    session.query(db.PluginStatus).update({db.PluginStatus.is_running : False})
    session.commit()
    db.delete_session()
    print("Run-states reset.")
def getNetlocPrefixes(netloc, length=2):
    # print("Netloc prefixes")
    # SELECT DISTINCT(substring(title for {len})) FROM book_items WHERE lower(title) LIKE %s AND src=%s;

    # So... the .distinct() operator on a query apparently cannot take a function
    # as a parameter, because stupid or something. As such, we hack the query we
    # want together as a return entity.
    substr_chunk = func.substring(func.upper(database.WebPages.title), 0, length)
    # distinct_hack = func.distinct(substr_chunk)

    query = database.get_session().query(database.WebPages) \
        .filter(database.WebPages.is_text == True)          \
        .filter(database.WebPages.file == None)             \
        .filter(database.WebPages.state == 'complete')      \
        .filter(database.WebPages.netloc == netloc)         \
        .group_by(substr_chunk)                             \
        .order_by(substr_chunk)                             \
        .with_entities(substr_chunk, func.min(database.WebPages.id), func.min(database.WebPages.netloc))

    vals = query.all()
    return vals
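# Hedged sketch, not part of the original: on PostgreSQL, SQLAlchemy's
# Query.distinct() has historically accepted column expressions and rendered a
# DISTINCT ON (...) clause, which would make the GROUP BY workaround above
# unnecessary. Whether a function expression is accepted depends on the
# SQLAlchemy version in use, so treat this as an assumption; the function name
# getNetlocPrefixes_distinct_on is hypothetical. Note that DISTINCT ON returns
# an arbitrary row per prefix unless a secondary ORDER BY is added, whereas the
# original picks min(id)/min(netloc).
def getNetlocPrefixes_distinct_on(netloc, length=2):
    substr_chunk = func.substring(func.upper(database.WebPages.title), 0, length)
    query = database.get_session().query(database.WebPages)                          \
        .filter(database.WebPages.is_text == True)                                   \
        .filter(database.WebPages.file == None)                                      \
        .filter(database.WebPages.state == 'complete')                               \
        .filter(database.WebPages.netloc == netloc)                                  \
        .with_entities(substr_chunk, database.WebPages.id, database.WebPages.netloc) \
        .distinct(substr_chunk)                                                      \
        .order_by(substr_chunk)
    return query.all()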
def retrigger_pages(self, releases):
    self.log.info("Total releases found on page: %s. Forcing retrigger of item pages.", len(releases))
    session = db.get_session()
    for release_url in releases:
        while 1:
            try:
                have = session.query(db.WebPages)           \
                    .filter(db.WebPages.url == release_url) \
                    .scalar()

                # If we don't have the page, ignore it, as the normal
                # new-link upsert mechanism will add it.
                if not have:
                    self.log.info("New: '%s'", release_url)
                    break

                # Also, don't reset if it's in-progress.
                if have.state in ['new', 'fetching', 'processing', 'removed']:
                    self.log.info("Skipping: '%s' (%s)", release_url, have.state)
                    break

                self.log.info("Retriggering page '%s'", release_url)
                have.state = 'new'
                session.commit()
                break

            except sqlalchemy.exc.InvalidRequestError:
                print("InvalidRequest error!")
                session.rollback()
                traceback.print_exc()
            except sqlalchemy.exc.OperationalError:
                print("Operational error!")
                session.rollback()
            except sqlalchemy.exc.IntegrityError:
                print("[upsertRssItems] -> Integrity error!")
                traceback.print_exc()
                session.rollback()
def renderFeedsTable(page=1):
    feeds = db.get_session().query(db.FeedItems) \
        .order_by(desc(db.FeedItems.published))
    # feeds = feeds.options(joinedload('tags'))
    # feeds = feeds.options(joinedload('authors'))

    if feeds is None:
        flash(gettext('No feeds? Something is /probably/ broken!'))
        return redirect(url_for('renderFeedsTable'))

    feed_entries = paginate(feeds, page, app.config['FEED_ITEMS_PER_PAGE'])

    return render_template('rss-pages/feeds.html',
            subheader     = "",
            sequence_item = feed_entries,
            page          = page
        )
def __init__(self, job_name):
    if not job_name in CALLABLE_LUT:
        raise JobNameException("Callable '%s' is not in the class lookup table: '%s'!" % (job_name, CALLABLE_LUT))
    self.runModule = CALLABLE_LUT[job_name]
    self.job_name  = job_name

    session = db.get_session()
    try:
        query = session.query(db.PluginStatus).filter(db.PluginStatus.plugin_name == job_name)
        have = query.scalar()
        if not have:
            new = db.PluginStatus(plugin_name=job_name)
            session.add(new)
            session.commit()
    except sqlalchemy.exc.OperationalError:
        session.rollback()
    except sqlalchemy.exc.InvalidRequestError:
        session.rollback()
    finally:
        db.delete_session()
def rss_db_sync(target=None, recent=False):
    json_file = 'rss_filter_misses-1.json'

    write_debug = True
    if target:
        config.C_DO_RABBIT = False
        flags.RSS_DEBUG    = True
        write_debug        = False
    else:
        try:
            os.unlink(json_file)
        except FileNotFoundError:
            pass

    import WebMirror.processor.RssProcessor
    parser = WebMirror.processor.RssProcessor.RssProcessor(loggerPath   = "Main.RssDb",
                                                            pageUrl     = 'http://www.example.org',
                                                            pgContent   = '',
                                                            type        = 'application/atom+xml',
                                                            transfer    = False,
                                                            debug_print = True,
                                                            write_debug = write_debug)

    print("Getting feed items....")

    if target:
        print("Limiting to '%s' source." % target)
        feed_items = db.get_session().query(db.FeedItems) \
            .filter(db.FeedItems.srcname == target)       \
            .order_by(db.FeedItems.srcname)               \
            .order_by(db.FeedItems.title)                 \
            .all()
    elif recent:
        cutoff = datetime.datetime.now() - datetime.timedelta(days=32)
        feed_items = db.get_session().query(db.FeedItems) \
            .filter(db.FeedItems.published > cutoff)      \
            .order_by(db.FeedItems.srcname)               \
            .order_by(db.FeedItems.title)                 \
            .all()
    else:
        feed_items = db.get_session().query(db.FeedItems) \
            .order_by(db.FeedItems.srcname)               \
            .order_by(db.FeedItems.title)                 \
            .all()

    print("Feed items: ", len(feed_items))

    for item in feed_items:
        ctnt = {}
        ctnt['srcname']   = item.srcname
        ctnt['title']     = item.title
        ctnt['tags']      = item.tags
        ctnt['linkUrl']   = item.contenturl
        ctnt['guid']      = item.contentid
        ctnt['published'] = calendar.timegm(item.published.timetuple())

        # Pop()ed off in processFeedData().
        ctnt['contents']  = 'wat'

        try:
            parser.processFeedData(ctnt, tx_raw=False)
        except ValueError:
            pass
        # print(ctnt)

    if target == None:
        sort_json(json_file)