def main():
    """
    Print, one per line, the storage location of every diskprint tarball
    awaiting processing.  With --tails_only, restrict output to slices that
    end a sequence (i.e. slices no other slice names as its predecessor).
    """
    global args
    logging.debug("Running main() of file: %r." % __file__)
    (inconn, incursor) = differ_library.db_conn_from_config_path(args.config)

    #Fetch work queue
    if args.tails_only:
        #This query is as in check_tarball_is_sequence_end.py, r574.
        query = """ SELECT storage.* FROM diskprint.slice AS slice, diskprint.storage AS storage WHERE storage.slicehash = slice.slicehash AND (slice.osetid, slice.appetid, slice.sliceid) NOT IN ( SELECT s1.osetid, s1.appetid, s1.slicepredecessorid FROM diskprint.slice AS s1, diskprint.slice AS s2 WHERE s1.osetid = s2.osetid AND s1.appetid = s2.appetid AND s1.slicepredecessorid = s2.sliceid ) ; """
    else:
        query = """ SELECT pq.* FROM diskprint.processqueue AS pq, diskprint.slice AS s WHERE s.slicehash = pq.slicehash ORDER BY sliceid DESC ; """
    incursor.execute(query)
    logging.info("Diskprint table Query: %s" % query)

    #Materialize the result set once; emptiness decides the final log line.
    work_rows = [row for row in incursor]
    for work_row in work_rows:
        queued_location = work_row["location"]
        logging.debug("PROCESS QUEUE LOCATION %s" % queued_location)
        print(queued_location)
    if len(work_rows) == 0:
        logging.info("No diskprints to process.")

    #Cleanup
    inconn.close()
def main():
    """
    Walk a fiwalk DFXML report (args.fiwalk_xml) and emit two SQL scripts in
    the current directory:

      * filemetadata.sql - INSERTs into diskprint.filemetadata for every
        allocated, regular file in the image;
      * md5.sql - INSERTs into diskprint.MD5 mapping each file's SHA-1 to
        its MD5.

    The diskprint slice is identified by looking up args.slice_path in
    diskprint.storage; exits 1 if the path does not match exactly one record.

    Fix over the original: the two output scripts, the XML input handle, and
    the database connection were opened but never closed; `with` blocks and an
    explicit close now guarantee the handles are released even on error.
    """
    #Open outputs first (matching the original side-effect order); the
    #context manager flushes/closes them on any exit path, including sys.exit.
    with open("filemetadata.sql", "w") as filemetadata_out, \
         open("md5.sql", "w") as md5_out:
        (conn, cursor) = differ_library.db_conn_from_config_path(args.config)

        #Get slice hash
        cursor.execute("SELECT slicehash FROM diskprint.storage WHERE location = %s", (args.slice_path,))
        inrows = [row for row in cursor]
        if len(inrows) != 1:
            logging.error("Could not find diskprint from tarball path: %r." % args.slice_path)
            sys.exit(1)
        slicehash = inrows[0]["slicehash"]

        def process_fi(fi):
            """
            Produce SQL records for every allocated file.
            (This is an inline function so the value of 'slicehash' is in scope.)
            """
            #Only allocated, regular files
            if not fi.allocated():
                return
            if fi.name_type() != "r":
                return

            #Build SQL templates
            md5_insert_template = "insert into diskprint.MD5 values ('%(keyhash)s','%(keyhash_md5)s');\n"
            filemetadata_insert_template = "insert into diskprint.filemetadata (keyhash, slicehash, path, filename, extension, bytes, mtime, ctime) values ('%(keyhash)s','%(slicehash)s','%(path)s','%(filename)s','%(extension)s',%(bytes)d,'%(mtime)s','%(ctime)s');\n"

            #Build SQL values as substitution dictionary
            d = dict()
            d["keyhash"] = fi.sha1()
            d["keyhash_md5"] = fi.md5()
            d["slicehash"] = slicehash
            d["path"] = fi.filename()
            d["filename"] = os.path.basename(fi.filename())
            d["extension"] = os.path.splitext(fi.filename())[1]
            d["bytes"] = fi.filesize()
            d["mtime"] = fi.mtime()
            d["ctime"] = fi.crtime() #TODO What does this table actually mean by ctime? Change, or create?

            #Output
            filemetadata_out.write(filemetadata_insert_template % d)
            md5_out.write(md5_insert_template % d)

        #Begin loop through XML; close the XML handle when the walk finishes.
        with open(args.fiwalk_xml, "rb") as xml_fh:
            dfxml.read_dfxml(xmlfile=xml_fh, callback=process_fi)

        conn.close()
def main():
    """
    Verify that args.slice_path names a tarball that (a) exists in the
    diskprint.storage table and (b) ends a slice sequence (no other slice
    lists it as a predecessor).  Raises on an unknown path; exits 1 if the
    slice is not a sequence end.
    """
    global args
    (inconn, incursor) = differ_library.db_conn_from_config_path(args.config)

    #Sanity check: the tarball path must be registered exactly once.
    incursor.execute(""" SELECT COUNT(*) AS tally FROM diskprint.storage WHERE location = %s ; """, (args.slice_path,))
    inrows = [row for row in incursor]
    #BUG FIX: a SELECT COUNT(*) query always yields exactly one row, so the
    #original guard `len(inrows) != 1` could never fire.  The value that must
    #be exactly 1 is the tally inside that row.
    if len(inrows) != 1 or inrows[0]["tally"] != 1:
        raise Exception("Could not find tarball path in diskprint.storage table: %r." % args.slice_path)

    #A sequence-ending slice is one whose (osetid, appetid, sliceid) triple
    #never appears as another slice's predecessor.
    incursor.execute(""" SELECT storage.* FROM diskprint.slice AS slice, diskprint.storage AS storage WHERE storage.location = %s AND storage.slicehash = slice.slicehash AND (slice.osetid, slice.appetid, slice.sliceid) NOT IN ( SELECT s1.osetid, s1.appetid, s1.slicepredecessorid FROM diskprint.slice AS s1, diskprint.slice AS s2 WHERE s1.osetid = s2.osetid AND s1.appetid = s2.appetid AND s1.slicepredecessorid = s2.sliceid ) ; """, (args.slice_path,))
    inrows = [row for row in incursor]
    if len(inrows) != 1:
        logging.error("Did not retrieve a single vetted tarball path; got %r instead. Concluding this is not a sequence end." % len(inrows))
        sys.exit(1)
def main():
    """
    Export one sequence's registry-delta analysis results from a local SQLite
    database (args.inputsqlite) into the central Postgres diskprint schema.

    Three phases:
      1. Translate SQLite hive records into Postgres diskprint.hive rows,
         letting Postgres assign hive IDs; build an old-ID -> new-ID map.
      2. Refuse to run if any regdelta rows for these hives already exist in
         Postgres (logs rollback DELETE queries for manual cleanup).
      3. Copy every SQLite regdelta row into diskprint.regdelta, translating
         hive IDs and cell-action text to Postgres IDs, committing once at
         the end.
    """
    global args
    #outconn/outcursor: Postgres destination; inconn/incursor: SQLite source.
    (outconn, outcursor) = differ_library.db_conn_from_config_path(args.config)
    inconn = sqlite3.connect(args.inputsqlite)
    inconn.row_factory = sqlite3.Row  #Rows behave like dicts (keyed access).
    incursor = inconn.cursor()

    #Get translation dictionary for cell actions
    outcursor.execute("SELECT * FROM diskprint.cell;")
    action_text_to_id = dict()
    for outrow in outcursor:
        action_text_to_id[outrow["actiontype"]] = outrow["actionid"]
    logging.debug("Cell action translation dictionary:\n\t%r" % action_text_to_id)

    #Populate Hive table and get new ID's assigned by auto-incrementor
    #The insert-then-commit operation should guarantee Postgres is doling out unique IDs.
    in_to_out_hiveid = dict()
    for inrow in incursor.execute("SELECT * FROM hive;"):
        indict = dict()
        outdict = dict()
        for k in inrow.keys():
            indict[k] = inrow[k]
        #Only these columns identify a hive sequence on the Postgres side;
        #the SQLite hiveid is NOT carried over (it gets translated below).
        for k in ["hivepath", "appetid", "osetid"]:
            outdict[k] = inrow[k]
        #Inline function for maybe-once repetition
        def fetch(od):
            outcursor.execute(""" SELECT * FROM diskprint.hive WHERE hivepath = %s AND appetid = %s AND osetid = %s ; """, (od["hivepath"], od["appetid"], od["osetid"]))
            return [row for row in outcursor]
        #checkrows should ultimately have just one record in it from the database, from which we get the translated ID of the hive sequence (hiveid identifies sequences)
        checkrows = fetch(outdict)
        if len(checkrows) > 1:
            #Database invariant broken: (hivepath, appetid, osetid) should be
            #unique.  Log a manual-rollback query for any hives already mapped.
            logging.error("Could not retrieve unique record that was just inserted; something about the database state is incorrect. Expected to get 1 record, got %d." % len(checkrows))
            if len(in_to_out_hiveid) > 0:
                logging.info("You may want to issue this query to undo changes made by this script, after looking at the table:\n\tDELETE FROM diskprint.hive WHERE hiveid IN (%s);" % ",".join(map(str, in_to_out_hiveid.values())))
            exit(1)
        elif len(checkrows) == 0:
            #Not present yet: insert, commit immediately so the ID is final,
            #then re-fetch to learn the Postgres-assigned hiveid.
            insert_db_postgres(outcursor, "hive", outdict)
            outconn.commit()
            checkrows = fetch(outdict)
        in_to_out_hiveid[indict["hiveid"]] = checkrows[0]["hiveid"]
    logging.debug("Hive ID translation dictionary:\n\t%r" % in_to_out_hiveid)
    hive_ids_str = ",".join(map(str, in_to_out_hiveid.values()))
    logging.info("If at any point after this the script fails, you should investigate and probably delete Postgres records with these rollback queries:\n\tDELETE FROM diskprint.hive WHERE hiveid IN (%s);\n\tDELETE FROM diskprint.regdelta WHEREhiveid IN (%s);" % (hive_ids_str, hive_ids_str))

    #Check for previous export of the mutation records
    #NOTE(review): hive_ids_str is %-interpolated into the SQL, not passed as
    #a parameter -- safe only because the IDs came from Postgres itself.
    outcursor.execute("""SELECT COUNT(*) AS tally FROM diskprint.regdelta WHERE hiveid IN (%s);""" % hive_ids_str)
    tallyoutrows = [row for row in outcursor]
    if len(tallyoutrows) != 1:
        logging.error("Not sure how, but a SELECT COUNT from Postgres didn't return 1 row; it returned %d. Quitting." % len(tallyoutrows))
        exit(1)
    tally_postgres = tallyoutrows[0]["tally"]
    incursor.execute("""SELECT COUNT(*) AS tally FROM regdelta;""")
    tallyinrows = [row for row in incursor]
    if len(tallyinrows) != 1:
        logging.error("Not sure how, but a SELECT COUNT from SQLite didn't return 1 row; it returned %d. Quitting." % len(tallyinrows))
        exit(1)
    tally_sqlite = tallyinrows[0]["tally"]
    if tally_postgres > 0:
        #A partial or prior export exists; require manual cleanup rather than
        #risk duplicating mutation records.
        logging.error("Some records for the hives in this sequential analysis are already in Postgres. Here is how the count compares to the SQLite that was about to be imported:\n\tPostgres\t%d\n\tSQLite\t%d" % (tally_postgres,tally_sqlite))
        logging.info("Given you invoked this script, you probably want the export to run to completion. Clear away the records by issuing the DELETE queries above in the Postgres server, and then you can re-run this script and it should complete.")
        exit(1)

    #Export the mutation records
    bailout = False
    for inrow in incursor.execute("SELECT * FROM regdelta;"):
        #Build output dictionary (we're about to tweak it)
        outdict = dict()
        for k in inrow.keys():
            outdict[k] = inrow[k]
        #Translate records
        outdict["hiveid"] = in_to_out_hiveid.get(inrow["hiveid"])
        outdict["appetid"] = inrow["appetid"]
        outdict["osetid"] = inrow["osetid"]
        outdict["sliceid"] = inrow["sliceid"]
        if outdict["hiveid"] is None:
            logging.error("Failed to translate a hiveid: %r was not found." % inrow["hiveid"])
            logging.info("Hive ID translation dictionary:\n\t%r" % in_to_out_hiveid)
            bailout = True
        #SQLite stores booleans as 0/1; Postgres wants real booleans.
        outdict["iskeybefore"] = inrow["iskeybefore"] == 1
        outdict["iskeyafter"] = inrow["iskeyafter"] == 1
        outdict["cellaction"] = action_text_to_id.get(inrow["cellaction"])
        if outdict["cellaction"] is None:
            logging.error("Failed to translate a cell action: %r not in table 'cell'." % inrow["cellaction"])
            bailout = True
        #Fail out if something didn't translate
        if bailout:
            logging.warning("You may want to purge the regdelta records from Postgres (this script was in the process of inserting them). See the above DELETE queries.")
            exit(1)
        #Ship it!
        insert_db_postgres(outcursor, "regdelta", outdict)
    #Single commit: the whole regdelta export lands atomically.
    outconn.commit()
def main():
    """
    Print, oldest first, the tarball location of every slice in the sequence
    that ends at args.slice_path.

    Walks the slicepredecessorid chain backwards from the slice matching
    args.slice_path, then resolves each sliceid in the chain to its storage
    location.  Raises on a missing path, ambiguous/missing history, or a
    predecessor loop.

    Fixes over the original: the path-lookup `except:` was a bare except
    (it would intercept KeyboardInterrupt/SystemExit just to emit a debug
    line), now narrowed to KeyError; the loop variable `id` shadowed the
    builtin and is renamed.
    """
    global args
    if not os.path.isfile(args.slice_path):
        raise Exception("Invoked on non-existent path '%s'." % args.slice_path)
    loglvl = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(
        format='%(asctime)s %(levelname)s: %(message)s',
        datefmt='%Y-%m-%dT%H:%M:%SZ',
        level=loglvl
    )
    (conn, cursor) = differ_library.db_conn_from_config_path(args.config)
    cursor.execute(""" SELECT slice.sliceid, slice.osetid, slice.appetid, storage.location FROM diskprint.storage AS storage, diskprint.slice AS slice WHERE slice.slicehash = storage.slicehash AND location = %s; """, (args.slice_path,))
    returned_rows = [row for row in cursor]
    if len(returned_rows) != 1:
        raise Exception("Error: unexpected data: Expected only one sliceid to match '%s', got %d." % (args.slice_path, len(returned_rows)))
    slice_id = returned_rows[0]["sliceid"]
    osetid = returned_rows[0]["osetid"]
    appetid = returned_rows[0]["appetid"]

    #Build chain of slice_id's, newest first, by following predecessor links.
    ids_reversed = [slice_id]
    earliest_id = slice_id
    while True:
        cursor.execute(""" SELECT slicepredecessorid FROM diskprint.slice AS slice WHERE osetid = %s AND appetid = %s AND sliceid = %s; """, (osetid, appetid, earliest_id))
        returned_rows = [row for row in cursor]
        if len(returned_rows) == 0:
            raise Exception("Missing data: sliceid %d's parent not present in slice table." % earliest_id)
        elif len(returned_rows) > 1:
            raise Exception("Ambiguous history: A disk slice can't have two immediately preceding slices.\n\tsliceid: %r." % earliest_id)
        earliest_id = returned_rows[0]["slicepredecessorid"]
        if earliest_id is None:
            #Done with this sequence.
            break
        else:
            #Guard against a predecessor cycle, which would loop forever.
            if earliest_id in ids_reversed:
                raise Exception("Encountered preceding-slice loop: sliceid %d already in sliceid list." % earliest_id)
            ids_reversed.append(earliest_id)
    ids_reversed.reverse()
    ids = ids_reversed

    #Look up paths of slice_id's
    #Fail out here if for some reason there are none:
    # "SQL doesn't allow an empty list in the IN operator, so your code should guard against empty tuples" <http://initd.org/psycopg/docs/usage.html#python-types-adaptation>.
    if len(ids) == 0:
        raise Exception("Somehow the ids list emptied; cannot proceed.")
    cursor.execute(""" SELECT slice.sliceid, storage.location FROM diskprint.storage AS storage, diskprint.slice AS slice WHERE slice.slicehash = storage.slicehash AND slice.osetid = %s AND slice.appetid = %s AND sliceid in %s; """, (osetid, appetid, tuple(ids)))

    #Build map of slice id to file system path
    id_to_path = dict()
    for row in cursor:
        id_to_path[int(row["sliceid"])] = row["location"]

    #Build list of file system paths
    paths = []
    for chain_id in ids:
        try:
            paths.append(id_to_path[chain_id])
        except KeyError:
            #A chain member had no storage record; note which before re-raising.
            sys.stderr.write("Debug: Was looking up id=%d\n" % chain_id)
            raise

    #Output paths
    for path in paths:
        print(path)