def make_fake_s3_dir(parent, file, cliargs):
    """Make fake s3 directory function.
    Creates a fake directory doc for es under the fake /s3/<bucket>/ tree.
    Returns dictionary for directory doc, or None if a doc for this
    path has already been created.
    """
    global fake_dirs

    fullpath = os.path.abspath(os.path.join(parent, file))

    # Bug fix: the original checked membership in fake_dirs before
    # acquiring the lock, so two workers could both pass the check and
    # emit duplicate directory docs. Check-and-register atomically.
    # NOTE(review): assumes s3threadlock is re-entrant (RLock) —
    # process_line already holds it when calling this function, which
    # the original's locked append also relied on. Confirm lock type.
    s3threadlock.acquire()
    try:
        if fullpath in fake_dirs:
            return None
        fake_dirs.append(fullpath)
    finally:
        s3threadlock.release()

    # S3 inventory provides no mtime for implied directories; use epoch
    mtime_utc = "1970-01-01T00:00:00"
    mtime_unix = time.mktime(time.strptime(mtime_utc, '%Y-%m-%dT%H:%M:%S'))

    dir_dict = {}
    dir_dict['filename'] = file
    dir_dict['path_parent'] = parent
    dir_dict["filesize"] = 0
    dir_dict["items"] = 1  # 1 for itself
    dir_dict["items_files"] = 0
    dir_dict["items_subdirs"] = 0
    dir_dict["last_modified"] = mtime_utc
    dir_dict["tag"] = ""
    dir_dict["tag_custom"] = ""
    dir_dict["indexing_date"] = datetime.utcnow().isoformat()
    dir_dict["worker_name"] = workername
    dir_dict["change_percent_filesize"] = ""
    dir_dict["change_percent_items"] = ""
    dir_dict["change_percent_items_files"] = ""
    dir_dict["change_percent_items_subdirs"] = ""
    dir_dict["_type"] = "directory"

    # add any autotags to dir_dict
    if cliargs['autotag'] and len(config['autotag_dirs']) > 0:
        auto_tag(dir_dict, 'directory', mtime_unix, None, None)

    # check plugins for adding extra meta data to dir_dict
    for plugin in plugins:
        try:
            # check if plugin is for directory doc (KeyError means it is not)
            mappings = {'mappings': {'directory': {'properties': {}}}}
            plugin.add_mappings(mappings)
            dir_dict.update(plugin.add_meta(fullpath))
        except KeyError:
            pass

    return dir_dict
def qumulo_get_file_meta(worker_name, path, cliargs, reindex_dict):
    """Build an es file doc (dict) from a Qumulo file metadata record.

    Returns the doc dictionary, or None when the file is excluded or
    filtered out by the minsize/mtime cli args.
    """
    filename = path['name']
    extension = os.path.splitext(filename)[1][1:].strip().lower()

    # skip anything on the excluded files list
    if file_excluded(filename, extension):
        return None

    # skip files smaller than the minsize cli arg (bytes)
    size = int(path['size'])
    if size < cliargs['minsize']:
        return None

    # skip files modified more recently than the mtime cli arg (days)
    mtime_utc = path['modification_time']
    mtime_unix = time.mktime(time.strptime(mtime_utc, '%Y-%m-%dT%H:%M:%S'))
    if time.time() - mtime_unix < cliargs['mtime'] * 86400:
        return None

    ctime_utc = path['change_time']
    ctime_unix = time.mktime(time.strptime(ctime_utc, '%Y-%m-%dT%H:%M:%S'))
    creation_time_utc = path['creation_time']

    # pseudo-hash built from metadata (size + mtime), used for dupe checks
    filehash = hashlib.md5(
        (str(size) + str(mtime_unix)).encode('utf-8')).hexdigest()

    indextime_utc = datetime.utcnow().isoformat()
    parentdir = os.path.abspath(os.path.join(path['path'], os.pardir))

    # owner lookup cache (Qumulo returns ids; cached value is the id itself)
    uid = path['owner']
    if uid in uids:
        owner = owners[uid]
    else:
        owner = uid
        uids.append(uid)
        owners[uid] = owner

    # group lookup cache, same pattern as owner
    gid = path['group']
    if gid in gids:
        group = groups[gid]
    else:
        group = gid
        gids.append(gid)
        groups[gid] = group

    filemeta_dict = {
        "filename": filename,
        "extension": extension,
        "path_parent": parentdir,
        "filesize": size,
        "owner": owner,
        "group": group,
        "last_modified": mtime_utc,
        "creation_time": creation_time_utc,
        "last_change": ctime_utc,
        "hardlinks": path['num_links'],
        "inode": str(path['id']),
        "filehash": filehash,
        "tag": "",
        "tag_custom": "",
        "dupe_md5": "",
        "indexing_date": indextime_utc,
        "worker_name": worker_name,
        "_type": "file"
    }

    # check plugins for adding extra meta data to filemeta_dict
    # (KeyError from add_mappings means the plugin is not for file docs)
    for plugin in plugins:
        try:
            plugin.add_mappings({'mappings': {'file': {'properties': {}}}})
            filemeta_dict.update(plugin.add_meta(path['path']))
        except KeyError:
            pass

    # add any autotags to filemeta_dict
    if cliargs['autotag'] and len(config['autotag_files']) > 0:
        auto_tag(filemeta_dict, 'file', mtime_unix, None, ctime_unix)

    # carry over any existing tags from reindex_dict
    for entry in reindex_dict['file']:
        if entry[0] == path['path']:
            filemeta_dict['tag'] = entry[1]
            filemeta_dict['tag_custom'] = entry[2]
            break

    return filemeta_dict
def qumulo_get_dir_meta(worker_name, path, cliargs, reindex_dict, redis_conn):
    """Build an es directory doc (dict) from a Qumulo directory record.

    Returns the doc dictionary, or the string "sametimes" when
    reindexing (index2) and the Redis-cached mtime+ctime sum matches
    the current values, meaning the directory is unchanged.
    """
    fullpath = path['path']
    if fullpath != '/':
        fullpath = fullpath.rstrip(os.path.sep)

    mtime_utc = path['modification_time']
    mtime_unix = time.mktime(time.strptime(mtime_utc, '%Y-%m-%dT%H:%M:%S'))
    ctime_utc = path['change_time']
    ctime_unix = time.mktime(time.strptime(ctime_utc, '%Y-%m-%dT%H:%M:%S'))
    creation_time_utc = path['creation_time']

    if cliargs['index2']:
        # compare directory times against the Redis cache so unchanged
        # dirs can be skipped by the caller
        # NOTE(review): base64.encodestring was removed in Python 3.9
        # (use encodebytes) — confirm target interpreter version
        redis_dirtime = redis_conn.get(
            base64.encodestring(fullpath.encode('utf-8', errors='ignore')))
        if redis_dirtime:
            cached_times = float(redis_dirtime.decode('utf-8'))
            current_times = float(mtime_unix + ctime_unix)
            if cached_times == current_times:
                return "sametimes"

    indextime_utc = datetime.utcnow().isoformat()

    # owner lookup cache (Qumulo returns ids; cached value is the id itself)
    uid = path['owner']
    if uid in uids:
        owner = owners[uid]
    else:
        owner = uid
        uids.append(uid)
        owners[uid] = owner

    # group lookup cache, same pattern as owner
    gid = path['group']
    if gid in gids:
        group = groups[gid]
    else:
        group = gid
        gids.append(gid)
        groups[gid] = group

    filename = path['name']
    parentdir = os.path.abspath(os.path.join(fullpath, os.pardir))

    dirmeta_dict = {
        "filename": filename,
        "path_parent": parentdir,
        "filesize": 0,
        "items": 1,  # 1 for itself
        "items_files": 0,
        "items_subdirs": 0,
        "last_modified": mtime_utc,
        "creation_time": creation_time_utc,
        "last_change": ctime_utc,
        "hardlinks": path['num_links'],
        "inode": str(path['id']),
        "owner": owner,
        "group": group,
        "tag": "",
        "tag_custom": "",
        "indexing_date": indextime_utc,
        "worker_name": worker_name,
        "change_percent_filesize": "",
        "change_percent_items": "",
        "change_percent_items_files": "",
        "change_percent_items_subdirs": "",
        "_type": "directory"
    }

    # check plugins for adding extra meta data to dirmeta_dict
    # (KeyError from add_mappings means the plugin is not for dir docs)
    for plugin in plugins:
        try:
            plugin.add_mappings({'mappings': {'directory': {'properties': {}}}})
            dirmeta_dict.update(plugin.add_meta(fullpath))
        except KeyError:
            pass

    # add any autotags to dirmeta_dict
    if cliargs['autotag'] and len(config['autotag_dirs']) > 0:
        auto_tag(dirmeta_dict, 'directory', mtime_unix, None, ctime_unix)

    # carry over any existing tags from reindex_dict
    for entry in reindex_dict['directory']:
        if entry[0] == fullpath:
            dirmeta_dict['tag'] = entry[1]
            dirmeta_dict['tag_custom'] = entry[2]
            break

    # cache directory times in Redis for change detection on reindex
    if config['redis_cachedirtimes'] in ('True', 'true'):
        redis_conn.set(
            base64.encodestring(fullpath.encode('utf-8', errors='ignore')),
            mtime_unix + ctime_unix,
            ex=config['redis_dirtimesttl'])

    return dirmeta_dict
def process_line(row, tree_dirs, tree_files, cliargs):
    """Process one S3 Inventory csv row.

    Builds a diskover es doc (file or directory) for the row under the
    fake path /s3/<bucket>/<key>, creates fake directory docs for any
    intermediate path components, and updates parent item counts.

    Returns the (possibly extended) tuple (tree_dirs, tree_files).
    """
    global fake_dirs

    # S3 Inventory csv columns: bucket, key, then a variable set of
    # optional columns depending on the inventory configuration.
    # Once one optional column is missing, all later ones are too,
    # so a single loop with break replaces the original per-column
    # try/except IndexError blocks (behavior-equivalent).
    inventory_dict = {'s3_bucket': row[0], 's3_key': row[1]}
    n = 2
    for key in ('s3_size', 's3_last_modified_date', 's3_etag',
                's3_storage_class', 's3_multipart_upload',
                's3_replication_status', 's3_encryption_status'):
        try:
            value = row[n]
        except IndexError:
            break
        inventory_dict[key] = int(value) if key == 's3_size' else value
        n += 1

    # fake path /s3/bucketname/key
    bucket = '/s3/' + row[0] + '/'
    path = os.path.join(bucket, inventory_dict['s3_key'])

    # keys ending in / are directories
    if path.endswith('/'):
        isdir = True
        path = path.rstrip('/')
        s3threadlock.acquire()
        fake_dirs.append(path)
        s3threadlock.release()
    else:
        isdir = False

    # add fake directory docs for any intermediate dirs in the key
    splitpath = inventory_dict['s3_key'].split('/')[:-1]  # drop leaf name
    prev_path = bucket.rstrip('/')
    for p in splitpath:
        # create fake directory entry (None if already created)
        s3threadlock.acquire()
        dir_dict = make_fake_s3_dir(prev_path, p, cliargs)
        s3threadlock.release()
        current_path = os.path.join(prev_path, p)
        if dir_dict is None:
            prev_path = current_path
            continue
        tree_dirs.append(dir_dict)
        # increment items counts of parentdir
        # NOTE(review): matches parent by basename only, like the other
        # count updates below — same-named dirs at different depths collide
        for d in tree_dirs:
            if d['filename'] == os.path.basename(dir_dict['path_parent']):
                d['items_subdirs'] += 1
                d['items'] += 1
                break
        prev_path = current_path

    # NOTE(review): assumes the size column is present in the inventory
    size = inventory_dict['s3_size']
    filename = os.path.basename(path)

    # check if file is in excluded_files list
    extension = os.path.splitext(filename)[1][1:].strip().lower()
    if file_excluded(filename, extension, path, cliargs['verbose']):
        return tree_dirs, tree_files

    # skip files smaller than minsize cli flag
    if not isdir and size < cliargs['minsize']:
        return tree_dirs, tree_files

    # modified time with fractional seconds stripped
    mtime_utc = inventory_dict['s3_last_modified_date'].partition('.')[0]
    mtime_unix = time.mktime(time.strptime(mtime_utc, '%Y-%m-%dT%H:%M:%S'))

    indextime_utc = datetime.utcnow().isoformat()
    # absolute parent dir and full path
    parentdir = os.path.abspath(os.path.join(path, os.pardir))
    fullpath = os.path.abspath(os.path.join(parentdir, filename))

    # remove any keys (fields) we don't want to add to ES
    inventory_dict.pop('s3_size', None)
    inventory_dict.pop('s3_last_modified_date', None)

    if isdir:  # directory doc
        inventory_dict['filename'] = filename
        inventory_dict['path_parent'] = parentdir
        inventory_dict["filesize"] = 0
        inventory_dict["items"] = 1  # 1 for itself
        inventory_dict["items_files"] = 0
        inventory_dict["items_subdirs"] = 0
        inventory_dict["last_modified"] = mtime_utc
        inventory_dict["tag"] = ""
        inventory_dict["tag_custom"] = ""
        inventory_dict["indexing_date"] = indextime_utc
        inventory_dict["worker_name"] = workername
        inventory_dict["change_percent_filesize"] = ""
        inventory_dict["change_percent_items"] = ""
        inventory_dict["change_percent_items_files"] = ""
        inventory_dict["change_percent_items_subdirs"] = ""
        inventory_dict["_type"] = "directory"

        # increment items counts of parentdir
        for d in tree_dirs:
            if d['filename'] == os.path.basename(parentdir):
                d['items_subdirs'] += 1
                d['items'] += 1
                break

        # add any autotags to inventory_dict
        if cliargs['autotag'] and len(config['autotag_dirs']) > 0:
            auto_tag(inventory_dict, 'directory', mtime_unix, None, None)

        # check plugins for adding extra meta data
        # (KeyError means the plugin is not for directory docs)
        for plugin in plugins:
            try:
                mappings = {'mappings': {'directory': {'properties': {}}}}
                plugin.add_mappings(mappings)
                inventory_dict.update(plugin.add_meta(fullpath))
            except KeyError:
                pass

        tree_dirs.append(inventory_dict)
    else:  # file doc
        # Convert time in days (mtime cli arg) to seconds
        time_sec = cliargs['mtime'] * 86400
        file_mtime_sec = time.time() - mtime_unix
        # Only process files modified at least x days ago.
        # Bug fix: original returned (tree_files, tree_dirs) here,
        # swapping the two lists for the caller.
        if file_mtime_sec < time_sec:
            return tree_dirs, tree_files

        # create md5 hash of file using metadata filesize and mtime
        filestring = str(size) + str(mtime_unix)
        filehash = hashlib.md5(filestring.encode('utf-8')).hexdigest()

        inventory_dict['filename'] = filename
        inventory_dict['path_parent'] = parentdir
        inventory_dict["extension"] = extension
        inventory_dict["filesize"] = size
        inventory_dict["last_modified"] = mtime_utc
        inventory_dict["filehash"] = filehash
        inventory_dict["tag"] = ""
        inventory_dict["tag_custom"] = ""
        inventory_dict["dupe_md5"] = ""
        inventory_dict["indexing_date"] = indextime_utc
        inventory_dict["worker_name"] = workername
        inventory_dict["_type"] = "file"

        # add file size and increment items counts to parentdir
        for d in tree_dirs:
            if d['filename'] == os.path.basename(parentdir):
                d['filesize'] += size
                d['items_files'] += 1
                d['items'] += 1
                break

        # check plugins for adding extra meta data
        # (KeyError means the plugin is not for file docs)
        for plugin in plugins:
            try:
                mappings = {'mappings': {'file': {'properties': {}}}}
                plugin.add_mappings(mappings)
                inventory_dict.update(plugin.add_meta(fullpath))
            except KeyError:
                pass

        # add any autotags to inventory_dict
        if cliargs['autotag'] and len(config['autotag_files']) > 0:
            auto_tag(inventory_dict, 'file', mtime_unix, None, None)

        tree_files.append(inventory_dict)

    return tree_dirs, tree_files