# stdlib imports used throughout this excerpt; project helpers (loggr, dotdict,
# cd, gh, mtd, mfd, make_rowObject, get_file_duration, hasher) are assumed to
# come from the rest of the repo
import argparse
import os
import re
import string
import subprocess
import time
from datetime import datetime
from pprint import pprint
from sys import platform


def fill_rowObj_fromFile(fullpath, rowObj, args):
    ''' fills rowObj with info from file '''
    file = os.path.basename(fullpath)
    duration = rowObj.data.duration
    loggr(duration)
    print(duration)
    if not duration:
        loggr("no duration in catalog, getting duration of " + file)
        print("no duration in catalog, getting duration of " + file)
        rowObj.data.duration = get_file_duration(fullpath)
    # pick the hash column based on where the file lives
    if "nas" in fullpath or "NAS" in fullpath:
        whichHash = 'SHA1hash-onRAID'
    else:
        whichHash = 'SHA1hash-ondrive'
    thehash = rowObj.data[whichHash]
    if not thehash:
        loggr("no hash in catalog, hashing " + file)
        print("no hash in catalog, hashing " + file)
        rowObj.data[whichHash] = hash_file(fullpath)
        if not rowObj.data[whichHash]:
            loggr("hash error")
            print("hash error")
    uid = rowObj.identifier
    if not uid:
        loggr("identifier not in rowObj, attempting to locate uid in filename in mfd.fill_rowObj_fromFile()")
        print("identifier not in rowObj, attempting to locate uid in filename in mfd.fill_rowObj_fromFile()")
        rowObj.identifier = get_uid_from_file(fullpath)
    return rowObj
def make_rowObj(args):
    ''' use make_rowObject to make a rowObj '''
    rowObj, header_map = make_rowObject.init_rowObject(args)
    rowObj = make_rowObject.fill_rowObject_fromCatalog(rowObj, header_map, args)
    loggr(rowObj)
    return rowObj
def clean_header_row(header_row):
    ''' removes list entries that we don't need, like name/role '''
    loggr("cleaning header_row in mro.clean_header_row()")
    print("cleaning header_row in mro.clean_header_row()")
    unwanted = ("name", "role", "hashes match?", "showHide", "identifier")
    _header_row = [header for header in header_row if header not in unwanted]
    loggr("header_row cleaning complete in mro.clean_header_row()")
    print("header_row cleaning complete in mro.clean_header_row()")
    return _header_row
def get_uid_from_file(filepath):
    ''' extracts the 14-digit uid number from full path '''
    loggr("attempting to locate uid in file name in mfd.get_uid_from_file()")
    print("attempting to locate uid in file name in mfd.get_uid_from_file()")
    match = re.search(r'\d{14}', filepath)
    if match:
        uid = match.group()
        print("uid is " + uid)
        return uid
    else:
        return False
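# A quick sketch of the behavior above (filenames hypothetical, for
# illustration only): re.search(r'\d{14}') returns the first run of 14 digits
# anywhere in the path, or None when there is no such run.
def _demo_get_uid_from_file():
    ''' hedged usage example for get_uid_from_file() '''
    assert get_uid_from_file("/mnt/nas/traffic/20170225120000_show.mp3") == "20170225120000"
    assert get_uid_from_file("/mnt/nas/traffic/untitled.mp3") is False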
def fill_rowObj_fromRow(rowObj, header_map, args):
    ''' if we know the row, grab the row data and transform it into rowObj '''
    loggr("filling rowObj with row " + str(rowObj.row) + " data from " + args.sheet + " in mro.fill_rowObj_fromRow()")
    print("filling rowObj with row " + str(rowObj.row) + " data from " + args.sheet + " in mro.fill_rowObj_fromRow()")
    rowData = args.worksheet.row_values(rowObj.row)
    for name, letter in header_map.items():
        # column letter -> 0-based list index, e.g. 'C' -> 2; assigns every key in data:{}
        # note: row_values() may omit trailing empty cells, which would raise IndexError here
        rowObj.data[name] = rowData[ord(letter) - 65]
    rowObj.identifier = args.worksheet.acell("A" + str(rowObj.row)).value
    loggr("rowObj fill from row complete")
    print("rowObj fill from row complete")
    return rowObj
def clean_header_column_map(header_column_map, header_row):
    ''' normalizes the map based on cleaned header info '''
    loggr("cleaning header_column_map in mro.clean_header_column_map()")
    print("cleaning header_column_map in mro.clean_header_column_map()")
    header_map = dotdict({})
    for header in header_row:
        header_map[header] = header_column_map[header]
    loggr("header_column_map cleaning completed in mro.clean_header_column_map()")
    print("header_column_map cleaning completed in mro.clean_header_column_map()")
    return header_map
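# dotdict is a project helper imported from elsewhere in the repo; a minimal
# sketch of the usual recipe it appears to follow is below (an assumption --
# the project's version may differ). It allows attribute-style access such as
# rowObj.data.duration alongside rowObj.data['duration'].
class dotdict(dict):
    ''' minimal sketch: dict with attribute-style get/set/delete '''
    __getattr__ = dict.get  # returns None (falsy) for missing keys
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__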
def main():
    ''' do the thing '''
    loggr("move_data.py started at " + str(datetime.now()))
    print("move_data.py started at " + str(datetime.now()))
    args = init()
    args.sheet = 'catalog'
    # per-platform mount points
    if platform == "linux" or platform == "linux2":
        args.Dropbox = "/root/Dropbox/MF archival audio"
        args.traffic = "/mnt/nas/traffic"
        args.nas = "/mnt/nas"
    elif platform == "darwin":
        args.Dropbox = "/root/Dropbox/MF archival audio"
        args.traffic = "/Volumes/NAS_Public/traffic"
        args.nas = "/Volumes/NAS_Public"
    loggr(args)
    if args.hasher:
        args = gh.get_worksheet(args)
        hasher(args.hasher, args)
    if args.it or args.io:
        inventory_directory(args)
    if args.mdtt:
        try:
            moveDropboxToTraffic(args)
        except Exception as e:
            loggr("moveDropboxToTraffic didn't work :( " + str(e))
def inventory_directory(args):
    ''' send file data to catalog
    get dir and filename lists from catalog - convert to list of single paths
    generate list of single paths for traffic
    if not in catalog: add filename, path, assign uid '''
    if args.it:
        args.path = args.traffic
    elif args.io:
        args.path = args.io
    loggr("getting list of files from " + args.path + " in md.inventory_directory()")
    print("getting list of files from " + args.path + " in md.inventory_directory()")
    for dirs, subdirs, files in os.walk(args.path):
        for file in files:
            if not file.endswith(".zip") and not file.startswith(".") and not file.endswith(".xml"):
                if args.start:
                    # assumes filenames begin with a 14-digit uid; skip files below --start
                    if int(args.start) <= int(file[:14]):
                        process_single_file(os.path.join(dirs, file), args)
                        loggr("sleeping for 60s for API reset")
                        print("sleeping for 60s for API reset")
                        time.sleep(60)
                    else:
                        continue
                else:
                    process_single_file(os.path.join(dirs, file), args)
                    loggr("sleeping for 60s for API reset")
                    print("sleeping for 60s for API reset")
                    time.sleep(60)
def hash_file(filepath):
    ''' uses shasum to create SHA1 hash of file '''
    loggr('attempting to hash file ' + filepath)
    print('attempting to hash file ' + filepath)
    try:
        output = subprocess.check_output("shasum '" + filepath + "'", shell=True)
    except subprocess.CalledProcessError as e:
        loggr("shasum failed: " + str(e))
        return False
    # search for 40 consecutive word characters in the shasum output,
    # decoded from CLI bytes to a utf-8 string
    match = re.search(r'\w{40}', output.decode("utf-8"))
    if match:
        # convert match object to string
        thehash = match.group()
        loggr("file " + os.path.basename(filepath) + " hash is " + thehash)
        print("file " + os.path.basename(filepath) + " hash is " + thehash)
        return thehash
    else:
        return False
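# The shell call above breaks on filenames that contain a single quote. A
# dependency-free alternative (a sketch, not the project's method) computes
# the same SHA1 with hashlib and no subprocess; hash_file_hashlib is a
# hypothetical name, not part of the repo:
def hash_file_hashlib(filepath):
    ''' SHA1-hash a file in 1 MiB chunks using hashlib instead of shasum '''
    import hashlib
    sha1 = hashlib.sha1()
    try:
        with open(filepath, 'rb') as f:
            # read in chunks so large audio files don't load into memory at once
            for chunk in iter(lambda: f.read(1024 * 1024), b''):
                sha1.update(chunk)
    except OSError:
        return False
    return sha1.hexdigest()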
def make_single_file_inventory(file, row, rowObj, uids, header_map, args):
    ''' generates catalog data for a single file '''
    fullpath = os.path.join(args.path, file)
    rowObj.row = row + 1
    rowObj.data.filename = file
    uidInFile = mfd.get_uid_from_file(fullpath)
    if uidInFile:
        rowObj.identifier = uidInFile
    else:
        rowObj.identifier = str(int(mtd.get_last_uid(args)) + 1)
    args.uid = rowObj.identifier
    # note: this key must match the catalog header exactly;
    # mfd.fill_rowObj_fromFile() uses 'SHA1hash-onRAID' / 'SHA1hash-ondrive'
    rowObj.data['SHA1 hash - on RAID'] = mfd.hash_file(fullpath)
    loggr(rowObj)
    if rowObj.identifier not in uids:
        rowObj.data.duration = mfd.get_file_duration(fullpath)
        gh.update_cell_value("A" + str(rowObj.row), rowObj.identifier, args.worksheet)
        for key, value in rowObj.data.items():
            gh.update_cell_value(header_map[key] + str(rowObj.row), value, args.worksheet)
    return rowObj
def init():
    ''' initialize vars '''
    loggr("initializing variables")
    parser = argparse.ArgumentParser(description="makes the metadata ~flow~")
    parser.add_argument('--moveDropboxToTraffic',
                        dest='mdtt',
                        action='store_true',
                        help="moves file from Dropbox folder to traffic on NAS")
    parser.add_argument('--inventoryTraffic',
                        dest="it",
                        default=False,
                        action='store_true',
                        help="send file data from traffic to catalog")
    parser.add_argument('--inventoryOther',
                        dest='io',
                        default=False,
                        help="the top-level path that you would like to inventory")
    parser.add_argument('--overwriteOK',
                        dest='ook',
                        action='store_true',
                        default=False,
                        help='allow re-upload of catalog data for existing entries')
    parser.add_argument('--start',
                        dest='start',
                        type=int,
                        default=0,
                        help="the starting uid to inventory from")
    parser.add_argument('--hasher',
                        dest="hasher",
                        help="hash all the files in a directory")
    args = parser.parse_args()
    return args
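# Example invocations (paths hypothetical; script name taken from the
# "move_data.py started" log in main()):
#   python move_data.py --inventoryTraffic
#   python move_data.py --inventoryTraffic --start 20170101000000
#   python move_data.py --inventoryOther /mnt/nas/other_audio
#   python move_data.py --moveDropboxToTraffic
#   python move_data.py --hasher /mnt/nas/traffic --overwriteOK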
def get_header(args):
    ''' returns list of header row of given sheet '''
    if not args.worksheet:
        loggr("getting ws data in mro.get_header()")
        print("getting ws data in mro.get_header()")
        args = gh.get_worksheet(args)
    loggr("retrieving header row from " + args.sheet + " in mro.get_header()")
    print("retrieving header row from " + args.sheet + " in mro.get_header()")
    header_row = args.worksheet.row_values(1)
    loggr("creating header_column_map from " + args.sheet + " in mro.get_header()")
    print("creating header_column_map from " + args.sheet + " in mro.get_header()")
    header_column_map = make_header_column_map(header_row)
    loggr("header_row and header_column_map created in mro.get_header()")
    print("header_row and header_column_map created in mro.get_header()")
    return header_row, header_column_map
def get_last_uid(args):
    ''' searches for highest number UID in sheets '''
    loggr("getting last/highest uid from catalog in mtd.get_last_uid()")
    print("getting last/highest uid from catalog in mtd.get_last_uid()")
    uids = []
    loggr("getting uids from " + args.sheet)
    print("getting uids from " + args.sheet)
    worksheet = args.spreadsheet.worksheet(args.sheet)
    _uids = worksheet.col_values(1)
    uids = uids + _uids[1:]  # drop the header row
    # caveat: col_values() returns strings, so max() compares lexicographically;
    # that is only correct while every uid has the same width (14 digits here)
    last_uid = max(uids)
    loggr("last uid is " + str(last_uid))
    print("last uid is " + str(last_uid))
    return last_uid
def make_header_column_map(header_row):
    ''' maps headers to ABCD etc '''
    loggr("initializing header_column_map in mro.make_header_column_map()")
    print("initializing header_column_map in mro.make_header_column_map()")
    header_column_map = dotdict({})
    loggr("filling header_column_map in mro.make_header_column_map()")
    print("filling header_column_map in mro.make_header_column_map()")
    # enumerate() instead of list.index() so repeated header names don't all
    # map to the first occurrence's column; only handles columns A-Z
    for _char, header in enumerate(header_row):
        char = string.ascii_uppercase[_char]
        header_column_map[header] = char
        print(header + ":" + char)
    loggr("header_column_map created in mro.make_header_column_map()")
    print("header_column_map created in mro.make_header_column_map()")
    return header_column_map
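# A short sketch of the expected mapping (hypothetical headers, for
# illustration only):
def _demo_make_header_column_map():
    ''' hedged usage example for make_header_column_map() '''
    m = make_header_column_map(["identifier", "filename", "duration"])
    assert m == {"identifier": "A", "filename": "B", "duration": "C"}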
def update_catalog(rowObj, catalog_rowObj, header_map, args):
    ''' updates catalog with data generated from file '''
    loggr("updating catalog with file info in md.update_catalog()")
    print("updating catalog with file info in md.update_catalog()")
    for key, value in rowObj.data.items():
        loggr(key)
        loggr("rowObj value: " + str(value))
        if value:
            catalog_value = catalog_rowObj.data[key]
            loggr("catalog_value: " + str(catalog_value))
            if not catalog_value:
                loggr("no catalog value found for key " + key + ", updating catalog")
                print("no catalog value found for key " + key + ", updating catalog")
                cell = header_map[key] + str(rowObj.row)
                loggr("updating cell " + cell + " with value " + str(value))
                print("updating cell " + cell + " with value " + str(value))
                gh.update_cell_value(cell, value, args.worksheet)
def is_file_cataloged(fullpath, args):
    ''' tries to find a file in the catalog
    returns False if file not in catalog
    returns rowObj if file in catalog '''
    loggr("initializing rowObj and header map in mtd.is_file_cataloged()")
    print("initializing rowObj and header map in mtd.is_file_cataloged()")
    rowObj, header_map = make_rowObject.init_rowObject(args)  # get blank rowObj
    pprint(rowObj)
    pprint(header_map)
    loggr("initializing lists of filenames, locations, rows from " + args.sheet + " in mtd.is_file_cataloged()")
    print("initializing lists of filenames, locations, rows from " + args.sheet + " in mtd.is_file_cataloged()")
    # column letter -> 1-based column number, e.g. 'C' -> 3 (col_values() is 1-indexed)
    indx = ord(header_map['filename']) - 64
    filenames = args.worksheet.col_values(indx)  # list of filenames from spreadsheet column
    indx = ord(header_map['RAID-dir']) - 64
    dirs = args.worksheet.col_values(indx)  # list of directories from spreadsheet column
    dirs = dirs[1:]  # remove header row from list of dirs
    uids = args.worksheet.col_values(1)  # list of uids - which are always in column 1/A
    fname = os.path.basename(fullpath)
    loggr("initialization complete in mtd.is_file_cataloged()")
    print("initialization complete in mtd.is_file_cataloged()")
    if fname in filenames:
        loggr("file is cataloged")
        print("file is cataloged")
        # add 1 because list indexes start at 0 and rows in Sheets start at 1
        rowObj.row = filenames.index(fname) + 1
        loggr("row is " + str(rowObj.row))
        print("row is " + str(rowObj.row))
        loggr("filling rowObj with row data in mtd.is_file_cataloged()")
        print("filling rowObj with row data in mtd.is_file_cataloged()")
        rowObj = make_rowObject.fill_rowObj_fromRow(rowObj, header_map, args)
        loggr("rowObj full in mtd.is_file_cataloged()")
        print("rowObj full in mtd.is_file_cataloged()")
        pprint(rowObj)
        return rowObj, header_map
    else:
        loggr("file is not cataloged")
        print("file is not cataloged")
        return False, header_map
def moveDropboxToTraffic(args):
    ''' checks if file is done copying - see if Dropbox is syncing currently?
    verify file not on NAS via catalog hash
    move file in tree to /NAS_Public/traffic '''
    if not os.path.exists(args.traffic):
        loggr("mount the NAS before continuing!", level="error")
        print("mount the NAS before continuing!")
        exit()
    loggr("args.Dropbox is " + args.Dropbox)
    for dirs, subdirs, files in os.walk(args.Dropbox):
        for file in files:
            if "." not in file:
                continue
            elif ".tmp" not in file and not file.startswith("."):
                loggr("processing file " + file)
                print("processing file " + file)
                fullpath = os.path.join(dirs, file)
                with cd(args.Dropbox):
                    output = subprocess.check_output('dropbox filestatus "' + file + '"', shell=True)
                output = output.decode("utf-8")
                loggr(output)
                # output looks like "/root/Dropbox/MF archival audio/<file>: up to date"
                outList = output.split(":")
                status = outList[1].lstrip()
                loggr(status)
                if "up to date" in status:
                    loggr("retrieving worksheet " + args.sheet + " in md.moveDropboxToTraffic()")
                    print("retrieving worksheet " + args.sheet + " in md.moveDropboxToTraffic()")
                    args = gh.get_worksheet(args)
                    loggr("checking if file is cataloged in md.moveDropboxToTraffic()")
                    print("checking if file is cataloged in md.moveDropboxToTraffic()")
                    file_is_cataloged, header_map = mtd.is_file_cataloged(os.path.join(args.Dropbox, file), args)
                    loggr(file_is_cataloged)
                    if not file_is_cataloged:
                        loggr("file is not cataloged")
                        print("file is not cataloged")
                        loggr("copying " + file)
                        print("copying " + file)
                        subprocess.check_output('rsync -av --progress "' + fullpath + '" ' + args.traffic, shell=True)
                    else:
                        loggr("file " + file + " is cataloged")
                        print("file " + file + " is cataloged")
                else:
                    loggr("still copying " + outList[0])
                    print("still copying " + outList[0])
                loggr("resting 30s for API reset")
                print("resting 30s for API reset")
                time.sleep(30)
def process_single_file(fullpath, args):
    ''' runs a single file through the process '''
    loggr("processing file " + os.path.basename(fullpath))
    print("processing file " + os.path.basename(fullpath))
    loggr("retrieving worksheet " + args.sheet + " in md.process_single_file()")
    print("retrieving worksheet " + args.sheet + " in md.process_single_file()")
    args = gh.get_worksheet(args)
    loggr("checking if file is cataloged in md.process_single_file()")
    print("checking if file is cataloged in md.process_single_file()")
    # two lookups on purpose: file_is_cataloged keeps the catalog's original
    # values for comparison in update_catalog(), while rowObj gets overwritten
    # with file-derived data below
    file_is_cataloged, header_map = mtd.is_file_cataloged(fullpath, args)
    rowObj, _header_map = mtd.is_file_cataloged(fullpath, args)
    loggr(file_is_cataloged)
    pprint(file_is_cataloged)
    if not file_is_cataloged:
        # both lookups return False here, so there is no row to update
        loggr("file is not cataloged, nothing to update in md.process_single_file()")
        print("file is not cataloged, nothing to update in md.process_single_file()")
        return
    if rowObj.data.filedata_complete == 'FALSE':
        loggr("filling rowObj from filedata in md.process_single_file()")
        print("filling rowObj from filedata in md.process_single_file()")
        rowObj = mfd.fill_rowObj_fromFile(fullpath, rowObj, args)
        loggr("rowObj")
        loggr(rowObj)
    loggr("file is cataloged")
    print("file is cataloged")
    loggr(file_is_cataloged)
    if not rowObj.identifier:
        loggr("no identifier in catalog or filename, generating new uid")
        print("no identifier in catalog or filename, generating new uid")
        last_uid = mtd.get_last_uid(args)
        rowObj.identifier = str(int(last_uid) + 1)
        loggr("uid is " + rowObj.identifier)
        print("uid is " + rowObj.identifier)
    loggr("sending updates to catalog in md.process_single_file()")
    print("sending updates to catalog in md.process_single_file()")
    update_catalog(rowObj, file_is_cataloged, header_map, args)
def init_rowObject(args):
    ''' initializes blank rowObject and map of header -> letter mapping, e.g. identifier:A '''
    loggr("initializing rowObj in init_rowObject()")
    print("initializing rowObj in init_rowObject()")
    rowObj = dotdict({"identifier": "", "row": "", "data": {}})
    loggr("getting header_row and creating header_column_map in mro.init_rowObject()")
    print("getting header_row and creating header_column_map in mro.init_rowObject()")
    header_row, header_column_map = get_header(args)
    loggr(header_row)
    loggr(header_column_map)
    loggr("cleaning header row in mro.init_rowObject()")
    print("cleaning header row in mro.init_rowObject()")
    header_row = clean_header_row(header_row)
    loggr(header_row)
    loggr("cleaning header_column_map in mro.init_rowObject()")
    print("cleaning header_column_map in mro.init_rowObject()")
    header_map = clean_header_column_map(header_column_map, header_row)
    loggr(header_map)
    loggr("initializing rowObj.data dotdict in mro.init_rowObject()")
    print("initializing rowObj.data dotdict in mro.init_rowObject()")
    for header in header_row:
        rowObj.data[header] = ""
    rowObj.data = dotdict(rowObj.data)  # convert plain dict to dotdict
    loggr("rowObj and header_map initialized in mro.init_rowObject()")
    print("rowObj and header_map initialized in mro.init_rowObject()")
    return rowObj, header_map
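# Shape of the returned objects (hypothetical headers, for illustration):
#   rowObj     -> {"identifier": "", "row": "", "data": {"filename": "", "duration": "", ...}}
#   header_map -> {"filename": "B", "duration": "C", ...}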