def get_items(prefix=None, ignores=IGNORE_PREFIXES, last_check_interval=None):
    """Return the FetchItems that should be processed

    This takes into account ITEMCLASSES and will generate the appropriate
    subclass based on registered prefixes.
    """
    sql = """
        SELECT url, type, mime
        FROM media
        WHERE type IS NOT NULL
          AND (last_status IS NULL
               OR (last_status >= 400 AND last_check < now() - %s::interval))
    """
    params = []
    params.append(last_check_interval or LAST_CHECK_INTERVAL)
    if prefix:
        sql += "\n AND url LIKE %s"
        params.append(prefix + '%')
    else:
        for i in ignores:
            assert i, "Can't ignore {0!r}".format(i)
            sql += "\n AND url NOT LIKE %s"
            params.append(i + '%')
    sql += "\n ORDER BY url"
    logger.debug("Querying %r", apidbpool.mogrify(sql, params))
    url_rows = apidbpool.fetchall(sql, params, cursor_factory=cursor)
    logger.info("Found %d urls to check", len(url_rows))
    for r in url_rows:
        m = PREFIX_RE.search(r[0])
        prefix = m and m.group()
        cls = ITEMCLASSES.get(prefix, FetchItem)
        yield cls(*r, prefix=prefix)
def objects_for_etags(etags):
    assert isinstance(etags, (tuple, list))
    sql = """SELECT etag, bucket
             FROM objects
             WHERE derivatives=false AND etag IN %s
          """
    # psycopg2 expands a tuple parameter into the (a, b, c) form that IN
    # expects, so convert in case a list was passed.
    return apidbpool.fetchall(sql, (tuple(etags), ),
                              cursor_factory=NamedTupleCursor)
def find_parts_on_servers(parts_obj):
    # Using the index of all files on all servers, return a list of dicts with
    # information about where each file is on disk(s)
    cols = ["server", "fullname", "filename", "size"]
    q = """SELECT {}
           FROM ceph_server_files
           WHERE filename LIKE %s
        """.format(','.join(cols))
    for i, part in enumerate(parts_obj):
        logger.debug("Looking up filenames for {0}".format(part["pattern"]))
        # When in doubt, add more backslashes!
        rows = apidbpool.fetchall(
            q, ("{0}%".format(part["pattern"].replace('\\', '\\\\')), ))
        copies = []
        for c in rows:
            copies.append(dict(zip(cols, c)))
        parts_obj[i]["copies"] = copies
    return parts_obj
def getitems():
    sql = """SELECT objects.bucket, objects.etag, objects.detected_mime as mime
             FROM objects
             JOIN media_objects USING (etag)
             WHERE media_objects.modified > '2016-08-01'
               AND derivatives = false
          """
    return set(apidbpool.fetchall(sql, cursor_factory=cursor))
def objects_for_buckets(buckets):
    assert isinstance(buckets, (tuple, list))
    sql = """SELECT etag, bucket
             FROM objects
             WHERE derivatives=false AND bucket IN %s
             ORDER BY random()
          """
    # As above, psycopg2 needs a tuple (not a list) for the IN parameter.
    return apidbpool.fetchall(sql, (tuple(buckets), ),
                              cursor_factory=NamedTupleCursor)
def get_paused_rsids():
    sql = """
        SELECT uuid
        FROM recordsets
        WHERE ingest=true
          AND uuid IS NOT NULL
          AND ingest_is_paused = true
    """
    params = []
    return [
        r[0] for r in apidbpool.fetchall(sql, params, cursor_factory=cursor)
    ]
def uuidsIter(uuid_l, ei, rc, typ, yield_record=False, children=False):
    for rid in uuid_l:
        if children:
            logger.debug("Selecting children of %s.", rid)
            sql = "SELECT * FROM idigbio_uuids_data WHERE parent=%s and type=%s"
        else:
            sql = "SELECT * FROM idigbio_uuids_data WHERE uuid=%s and type=%s"
        params = (rid.strip(), typ[:-1])
        results = apidbpool.fetchall(sql, params, cursor_factory=DictCursor)
        for rec in results:
            if yield_record:
                yield rec
            else:
                yield index_record(ei, rc, typ, rec, do_index=False)
def get_active_rsids(since=None):
    sql = """
        SELECT uuid
        FROM recordsets
        WHERE ingest=true
          AND uuid IS NOT NULL
          AND file_harvest_date IS NOT NULL
    """
    params = []
    if since:
        # Leading newline keeps this clause from running into the ORDER BY
        # appended below.
        sql += "\n AND file_harvest_date >= %s"
        params.append(since)
    sql += "\n ORDER BY file_harvest_date DESC"
    return [
        r[0] for r in apidbpool.fetchall(sql, params, cursor_factory=cursor)
    ]
def get_row_objs_from_db(args):
    """Get a list of objects to reconstruct from the database.

    Uses the user's arguments to build the query for which objects to verify.
    """
    cols = [
        "ceph_bucket", "ceph_name", "ceph_date", "ceph_bytes", "ceph_etag",
        "ver_status", "ver_last_success", "ver_last_failure"
    ]
    wheres = []
    wheres.append("length(ceph_name)>=10")
    #wheres.append("ceph_bytes IS NOT NULL") # for initial testing
    #wheres.append("ceph_date IS NULL") # for testing date updates
    if args["start"]:
        wheres.append("ceph_date>=%(start)s")
    if args["end"]:
        wheres.append("ceph_date<=%(end)s")
    if args["name"]:
        wheres.append("ceph_name like %(name)s")
    if args["bucket"]:
        wheres.append("ceph_bucket=%(bucket)s")
    if args["verify"]:
        wheres.append("ver_status=%(verify)s")
    else:
        wheres.append("ver_status='timeout'")
    if args["rereconstruct"]:
        wheres.append("rest_status=%(rereconstruct)s")
    else:
        wheres.append("rest_status IS NULL")
    rows = apidbpool.fetchall(
        """SELECT {0} FROM ceph_objects WHERE {1} LIMIT %(count)s""".format(
            ','.join(cols), ' AND '.join(wheres)), args)
    row_objs = []
    for row in rows:
        row_objs.append(dict(zip(cols, row)))
    logger.info("Found {0} objects to work on".format(len(row_objs)))
    return row_objs
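# A minimal, hypothetical usage sketch (not part of the original module): the
# args mapping normally comes from CLI parsing elsewhere, but every key below
# is one the query builder above actually reads. Values are illustrative only.
def _example_get_row_objs():
    example_args = {
        "start": "2016-01-01",   # ceph_date lower bound, or None/falsy to skip
        "end": None,             # ceph_date upper bound
        "name": None,            # LIKE pattern for ceph_name
        "bucket": None,          # restrict to a single ceph_bucket
        "verify": None,          # falls back to ver_status='timeout'
        "rereconstruct": None,   # falls back to rest_status IS NULL
        "count": 100,            # LIMIT on rows returned
    }
    return get_row_objs_from_db(example_args)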
def backfill_flagged_etags(prefix):
    """Update ceph_objects_temp etags where records have a flag of any kind"""
    table = make_temp_table_name(prefix)
    logger.info(
        "Backfilling etags on new/changed records in {0}".format(table))
    cols = ["ceph_bucket", "ceph_name"]
    results = apidbpool.fetchall("""SELECT {0}
        FROM {1}
        WHERE ceph_status IS NOT NULL
        """.format(','.join(cols), table))
    row_objs = []
    # Convert to something we can use with pool.imap_unordered
    for row in results:
        # Pack the prefix in so we know which temp table the work goes with;
        # rows come back as tuples, so convert before concatenating.
        row_objs.append(dict(zip(cols + ["prefix"], list(row) + [prefix])))
    # Found that batching up rows saves a bit of CPU time compared with
    # greenlet switching and committing each row individually.
    pools = 3
    batches = int(max(math.floor(len(row_objs) / 5000), pools))
    work = batch_work(row_objs, batches)
    p = pool.Pool(pools)
    results = p.imap_unordered(backfill_flagged_worker, work)
    return sum(results)
def geturls(etag):
    sql = """SELECT DISTINCT url
             FROM media_objects
             WHERE etag LIKE %s"""
    return set(u[0] for u in
               apidbpool.fetchall(sql, (etag, ), cursor_factory=cursor))