Example #1
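These examples are excerpted from diskover's S3-inventory and Qumulo worker modules, so they rely on module-level imports and shared state that the excerpts themselves don't show. The sketch below reconstructs that context; the names (fake_dirs, s3threadlock, workername) come from the code, while the RLock type is an inference, since process_line in Example #4 holds the lock across its call into make_fake_s3_dir, which acquires it again.

import csv
import gzip
import grp
import hashlib
import os
import pwd
import time
from datetime import datetime
from threading import RLock

import diskover
import diskover_worker_bot

# shared module state used by the S3 examples below; an RLock is assumed
# because make_fake_s3_dir re-acquires the lock its caller already holds
s3threadlock = RLock()
fake_dirs = []
workername = diskover_worker_bot.get_worker_name()
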
def make_fake_s3_dir(parent, file, cliargs):
    """Make fake s3 directory function.
    Creates a fake directory doc for es.
    Returns dictionary for directory doc.
    """
    global fake_dirs

    fullpath = os.path.abspath(os.path.join(parent, file))

    if fullpath in fake_dirs:
        return None

    mtime_utc = "1970-01-01T00:00:00"
    mtime_unix = time.mktime(time.strptime(mtime_utc, '%Y-%m-%dT%H:%M:%S'))

    dir_dict = {}
    dir_dict['filename'] = file
    dir_dict['path_parent'] = parent
    dir_dict["filesize"] = 0
    dir_dict["items"] = 1  # 1 for itself
    dir_dict["items_files"] = 0
    dir_dict["items_subdirs"] = 0
    dir_dict["last_modified"] = mtime_utc
    dir_dict["tag"] = ""
    dir_dict["tag_custom"] = ""
    dir_dict["indexing_date"] = datetime.utcnow().isoformat()
    dir_dict["worker_name"] = workername
    dir_dict["change_percent_filesize"] = ""
    dir_dict["change_percent_items"] = ""
    dir_dict["change_percent_items_files"] = ""
    dir_dict["change_percent_items_subdirs"] = ""
    dir_dict["_type"] = "directory"

    # add any autotags to dir_dict
    if cliargs['autotag'] and len(diskover.config['autotag_dirs']) > 0:
        diskover_worker_bot.auto_tag(dir_dict, 'directory', mtime_unix, None,
                                     None)

    # check plugins for adding extra metadata to dir_dict
    for plugin in diskover.plugins:
        try:
            # check if plugin is for directory doc; plugins that only
            # handle file docs raise a KeyError on this mapping check
            mappings = {'mappings': {'directory': {'properties': {}}}}
            plugin.add_mappings(mappings)
            dir_dict.update(plugin.add_meta(fullpath))
        except KeyError:
            pass

    # store in fake_dirs
    s3threadlock.acquire()
    fake_dirs.append(fullpath)
    s3threadlock.release()

    return dir_dict
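
A minimal usage sketch for Example #1 (paths are hypothetical; cliargs needs only the keys the function actually reads, and autotagging is off so diskover.config is never consulted):

# build a fake directory doc for /s3/mybucket/photos (hypothetical names)
cliargs = {'autotag': False}
doc = make_fake_s3_dir('/s3/mybucket', 'photos', cliargs)
if doc is not None:  # None means the path was already registered
    print(doc['filename'], doc['path_parent'])  # -> photos /s3/mybucket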
Example #2
def qumulo_get_file_meta(path, cliargs, reindex_dict):
    filename = path['name']

    # check if file is in excluded_files list
    extension = os.path.splitext(filename)[1][1:].strip().lower()
    if diskover_worker_bot.file_excluded(filename, extension, path['path'],
                                         cliargs['verbose']):
        return None

    # get file size (bytes)
    size = path['size']

    # Skip files smaller than minsize cli flag
    if size < cliargs['minsize']:
        return None

    # check file modified time
    mtime_utc = path['modification_time']
    mtime_unix = time.mktime(time.strptime(mtime_utc, '%Y-%m-%dT%H:%M:%S'))

    # Convert time in days (mtime cli arg) to seconds
    time_sec = cliargs['mtime'] * 86400
    file_mtime_sec = time.time() - mtime_unix
    # Only process files modified at least x days ago
    if file_mtime_sec < time_sec:
        return None

    # get change time
    ctime_utc = path['change_time']
    ctime_unix = time.mktime(time.strptime(ctime_utc, '%Y-%m-%dT%H:%M:%S'))
    # get creation time
    creation_time_utc = path['creation_time']

    # create md5 hash of file using metadata filesize and mtime
    filestring = str(size) + str(mtime_unix)
    filehash = hashlib.md5(filestring.encode('utf-8')).hexdigest()
    # get index time in utc
    indextime_utc = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f")
    # get absolute path of parent directory
    parentdir = os.path.abspath(os.path.join(path['path'], os.pardir))
    # get user id of owner
    uid = int(path['owner'])
    # try to get owner user name
    # first check cache
    if uid in diskover_worker_bot.uids:
        owner = diskover_worker_bot.owners[uid]
    # not in cache
    else:
        try:
            owner = pwd.getpwuid(uid).pw_name.split('\\')
            # remove domain before owner
            if len(owner) == 2:
                owner = owner[1]
            else:
                owner = owner[0]
        # if we can't find the owner's user name, use the uid number
        except KeyError:
            owner = uid
        # store it in cache
        if uid not in diskover_worker_bot.uids:
            diskover_worker_bot.uids.append(uid)
            diskover_worker_bot.owners[uid] = owner
    # get group id
    gid = int(path['group'])
    # try to get group name
    # first check cache
    if gid in diskover_worker_bot.gids:
        group = diskover_worker_bot.groups[gid]
    # not in cache
    else:
        try:
            group = grp.getgrgid(gid).gr_name.split('\\')
            # remove domain before group
            if len(group) == 2:
                group = group[1]
            else:
                group = group[0]
        # if we can't find the group name, use the gid number
        except KeyError:
            group = gid
        # store in cache
        if gid not in diskover_worker_bot.gids:
            diskover_worker_bot.gids.append(gid)
            diskover_worker_bot.groups[gid] = group

    # create file metadata dictionary
    filemeta_dict = {
        "filename": filename,
        "extension": extension,
        "path_parent": parentdir,
        "filesize": size,
        "owner": owner,
        "group": group,
        "last_modified": mtime_utc,
        "creation_time": creation_time_utc,
        "last_change": ctime_utc,
        "hardlinks": path['num_links'],
        "inode": path['id'],
        "filehash": filehash,
        "tag": "",
        "tag_custom": "",
        "dupe_md5": "",
        "indexing_date": indextime_utc,
        "worker_name": diskover_worker_bot.get_worker_name()
    }

    # check plugins for adding extra metadata to filemeta_dict
    for plugin in diskover.plugins:
        try:
            # check if plugin is for file doc
            mappings = {'mappings': {'file': {'properties': {}}}}
            plugin.add_mappings(mappings)
            filemeta_dict.update(plugin.add_meta(path['path']))
        except KeyError:
            pass

    # add any autotags to filemeta_dict
    if cliargs['autotag'] and len(diskover.config['autotag_files']) > 0:
        diskover_worker_bot.auto_tag(filemeta_dict, 'file', mtime_unix, None,
                                     ctime_unix)

    # search for and copy over any existing tags from reindex_dict
    for sublist in reindex_dict['file']:
        if sublist[0] == path['path']:
            filemeta_dict['tag'] = sublist[1]
            filemeta_dict['tag_custom'] = sublist[2]
            break

    return filemeta_dict
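
For Example #2, path is a Qumulo file-attributes dict; the keys below are exactly the ones the function reads, with hypothetical values. An empty reindex_dict['file'] means no existing tags are carried over:

path = {
    'name': 'report.pdf',
    'path': '/projects/report.pdf',
    'size': 1048576,
    'modification_time': '2018-06-01T12:00:00',
    'change_time': '2018-06-01T12:00:00',
    'creation_time': '2018-05-01T09:30:00',
    'owner': '1000',
    'group': '1000',
    'num_links': 1,
    'id': '4000012345',
}
cliargs = {'verbose': False, 'minsize': 0, 'mtime': 0, 'autotag': False}
# returns the metadata dict, or None if the file was excluded or filtered
meta = qumulo_get_file_meta(path, cliargs, {'file': []})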
Example #3
def qumulo_get_dir_meta(path, cliargs, reindex_dict, redis_conn):
    if path['path'] != '/':
        fullpath = path['path'].rstrip(os.path.sep)
    else:
        fullpath = path['path']
    mtime_utc = path['modification_time']
    mtime_unix = time.mktime(time.strptime(mtime_utc, '%Y-%m-%dT%H:%M:%S'))
    ctime_utc = path['change_time']
    ctime_unix = time.mktime(time.strptime(ctime_utc, '%Y-%m-%dT%H:%M:%S'))
    creation_time_utc = path['creation_time']
    if cliargs['index2']:
        # check if directory times cached in Redis
        redis_dirtime = redis_conn.get(
            fullpath.encode('utf-8', errors='ignore'))
        if redis_dirtime:
            cached_times = float(redis_dirtime.decode('utf-8'))
            # check if cached times are the same as on disk
            current_times = float(mtime_unix + ctime_unix)
            if cached_times == current_times:
                return "sametimes"
    # get time now in utc
    indextime_utc = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f")
    # get user id of owner
    try:
        uid = int(path['owner'])
        # try to get owner user name
        # first check cache
        if uid in diskover_worker_bot.uids:
            owner = diskover_worker_bot.owners[uid]
        # not in cache
        else:
            try:
                owner = pwd.getpwuid(uid).pw_name.split('\\')
                # remove domain before owner
                if len(owner) == 2:
                    owner = owner[1]
                else:
                    owner = owner[0]
            # if we can't find the owner's user name, use the uid number
            except KeyError:
                owner = uid
            # store it in cache
            if uid not in diskover_worker_bot.uids:
                diskover_worker_bot.uids.append(uid)
                diskover_worker_bot.owners[uid] = owner
    except ValueError:  # Qumulo local user type
        owner = path['owner']
    # get group id
    try:
        gid = int(path['group'])
        # try to get group name
        # first check cache
        if gid in diskover_worker_bot.gids:
            group = diskover_worker_bot.groups[gid]
        # not in cache
        else:
            try:
                group = grp.getgrgid(gid).gr_name.split('\\')
                # remove domain before group
                if len(group) == 2:
                    group = group[1]
                else:
                    group = group[0]
            # if we can't find the group name, use the gid number
            except KeyError:
                group = gid
            # store in cache
            if gid not in diskover_worker_bot.gids:
                diskover_worker_bot.gids.append(gid)
                diskover_worker_bot.groups[gid] = group
    except ValueError:  # Qumulo local group type
        group = path['group']

    filename = path['name']
    parentdir = os.path.abspath(os.path.join(fullpath, os.pardir))

    dirmeta_dict = {
        "filename": filename,
        "path_parent": parentdir,
        "filesize": 0,
        "items": 1,  # 1 for itself
        "items_files": 0,
        "items_subdirs": 0,
        "last_modified": mtime_utc,
        "creation_time": creation_time_utc,
        "last_change": ctime_utc,
        "hardlinks": path['num_links'],
        "inode": path['id'],
        "owner": owner,
        "group": group,
        "tag": "",
        "tag_custom": "",
        "indexing_date": indextime_utc,
        "worker_name": diskover_worker_bot.get_worker_name(),
        "change_percent_filesize": "",
        "change_percent_items": "",
        "change_percent_items_files": "",
        "change_percent_items_subdirs": ""
    }

    # check plugins for adding extra metadata to dirmeta_dict
    for plugin in diskover.plugins:
        try:
            # check if plugin is for directory doc
            mappings = {'mappings': {'directory': {'properties': {}}}}
            plugin.add_mappings(mappings)
            dirmeta_dict.update(plugin.add_meta(fullpath))
        except KeyError:
            pass

    # add any autotags to dirmeta_dict
    if cliargs['autotag'] and len(diskover.config['autotag_dirs']) > 0:
        diskover_worker_bot.auto_tag(dirmeta_dict, 'directory', mtime_unix,
                                     None, ctime_unix)

    # search for and copy over any existing tags from reindex_dict
    for sublist in reindex_dict['directory']:
        if sublist[0] == fullpath:
            dirmeta_dict['tag'] = sublist[1]
            dirmeta_dict['tag_custom'] = sublist[2]
            break

    # cache directory times in Redis
    redis_conn.set(fullpath.encode('utf-8', errors='ignore'),
                   mtime_unix + ctime_unix,
                   ex=diskover.config['redis_dirtimesttl'])

    return dirmeta_dict
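
The Redis round-trip in Example #3 keeps reindexing (cliargs['index2']) cheap: when a directory's summed mtime + ctime matches the cached value, the worker returns the "sametimes" sentinel instead of rebuilding the doc. A sketch of that interaction in isolation (the connection setup and TTL value are assumptions; diskover derives both from its config):

import redis

redis_conn = redis.StrictRedis(host='localhost', port=6379, db=0)

fullpath = '/projects'  # hypothetical directory
mtime_unix = ctime_unix = 1527854400.0

# first crawl: cache the summed times with an expiry (TTL assumed here)
redis_conn.set(fullpath.encode('utf-8', errors='ignore'),
               mtime_unix + ctime_unix, ex=604800)

# reindex: a hit with unchanged times lets the worker skip the directory
cached = redis_conn.get(fullpath.encode('utf-8', errors='ignore'))
if cached and float(cached.decode('utf-8')) == float(mtime_unix + ctime_unix):
    print('sametimes')  # the sentinel qumulo_get_dir_meta returns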
Example #4
def process_line(row, tree_dirs, tree_files, tree_crawltimes, cliargs):
    global fake_dirs

    starttime = time.time()
    n = 2
    # first two S3 inventory CSV columns (bucket, key) are required;
    # the remaining columns are optional and parsed positionally below
    inventory_dict = {'s3_bucket': row[0], 's3_key': row[1]}
    try:
        inventory_dict['s3_size'] = int(row[n])
        n = n + 1
    except IndexError:
        pass
    try:
        inventory_dict['s3_last_modified_date'] = row[n]
        n = n + 1
    except IndexError:
        pass
    try:
        inventory_dict['s3_etag'] = row[n]
        n = n + 1
    except IndexError:
        pass
    try:
        inventory_dict['s3_storage_class'] = row[n]
        n = n + 1
    except IndexError:
        pass
    try:
        inventory_dict['s3_multipart_upload'] = row[n]
        n = n + 1
    except IndexError:
        pass
    try:
        inventory_dict['s3_replication_status'] = row[n]
        n = n + 1
    except IndexError:
        pass
    try:
        inventory_dict['s3_encryption_status'] = row[n]
    except IndexError:
        pass

    # prepare inventory dict for diskover index

    # fake path /s3/bucketname/key
    bucket = '/s3/' + row[0] + '/'
    path = os.path.join(bucket, inventory_dict['s3_key'])
    # check if directory
    if path.endswith('/'):
        isdir = True
        path = path.rstrip('/')
        s3threadlock.acquire()
        fake_dirs.append(path)
        s3threadlock.release()
    else:
        isdir = False
        # add any directories in path to fake dirs
        splitpath = inventory_dict['s3_key'].split('/')
        # remove file name
        splitpath = splitpath[:-1]
        prev_path = bucket.rstrip('/')
        for p in splitpath:
            # create fake directory entry
            s3threadlock.acquire()
            dir_dict = make_fake_s3_dir(prev_path, p, cliargs)
            s3threadlock.release()
            current_path = os.path.join(prev_path, p)
            if dir_dict is None:
                prev_path = current_path
                continue
            tree_dirs.append(dir_dict)
            # create fake crawltime entry
            tree_crawltimes.append({
                "path": current_path,
                "worker_name": workername,
                "crawl_time": 0,
                "indexing_date": datetime.utcnow().isoformat(),
                "_type": "crawlstat"
            })
            prev_path = current_path

    size = inventory_dict['s3_size']
    # filename
    filename = os.path.basename(path)
    # check if file is in excluded_files list
    extension = os.path.splitext(filename)[1][1:].strip().lower()
    if diskover_worker_bot.file_excluded(filename, extension, path,
                                         cliargs['verbose']):
        return tree_dirs, tree_files, tree_crawltimes
    # Skip files smaller than minsize cli flag
    if not isdir and size < cliargs['minsize']:
        return tree_dirs, tree_files, tree_crawltimes
    # modified time
    mtime_utc = inventory_dict['s3_last_modified_date'].partition('.')[0]
    # modified time in unix
    mtime_unix = time.mktime(time.strptime(mtime_utc, '%Y-%m-%dT%H:%M:%S'))
    # get index time in utc
    indextime_utc = datetime.utcnow().isoformat()
    # get absolute path of parent directory
    parentdir = os.path.abspath(os.path.join(path, os.pardir))
    # absolute full path
    fullpath = os.path.abspath(os.path.join(parentdir, filename))

    # remove any keys (fields) we don't want to add to ES
    inventory_dict.pop('s3_size', None)
    inventory_dict.pop('s3_last_modified_date', None)

    if isdir:  # directory
        inventory_dict['filename'] = filename
        inventory_dict['path_parent'] = parentdir
        inventory_dict["filesize"] = 0
        inventory_dict["items"] = 1  # 1 for itself
        inventory_dict["items_files"] = 0
        inventory_dict["items_subdirs"] = 0
        inventory_dict["last_modified"] = mtime_utc
        inventory_dict["tag"] = ""
        inventory_dict["tag_custom"] = ""
        inventory_dict["indexing_date"] = indextime_utc
        inventory_dict["worker_name"] = workername
        inventory_dict["change_percent_filesize"] = ""
        inventory_dict["change_percent_items"] = ""
        inventory_dict["change_percent_items_files"] = ""
        inventory_dict["change_percent_items_subdirs"] = ""
        inventory_dict["_type"] = "directory"

        # add any autotags to inventory_dict
        if cliargs['autotag'] and len(diskover.config['autotag_dirs']) > 0:
            diskover_worker_bot.auto_tag(inventory_dict, 'directory',
                                         mtime_unix, None, None)

        # check plugins for adding extra metadata to inventory_dict
        for plugin in diskover.plugins:
            try:
                # check if plugin is for directory doc
                mappings = {'mappings': {'directory': {'properties': {}}}}
                plugin.add_mappings(mappings)
                inventory_dict.update(plugin.add_meta(fullpath))
            except KeyError:
                pass

        tree_dirs.append(inventory_dict)
        tree_crawltimes.append({
            "path": path,
            "worker_name": workername,
            "crawl_time": time.time() - starttime,
            "indexing_date": datetime.utcnow().isoformat(),
            "_type": "crawlstat"
        })

    else:  # file
        # Convert time in days (mtime cli arg) to seconds
        time_sec = cliargs['mtime'] * 86400
        file_mtime_sec = time.time() - mtime_unix
        # Only process files modified at least x days ago
        if file_mtime_sec < time_sec:
            return tree_dirs, tree_files, tree_crawltimes
        # create md5 hash of file using metadata filesize and mtime
        filestring = str(size) + str(mtime_unix)
        filehash = hashlib.md5(filestring.encode('utf-8')).hexdigest()

        inventory_dict['filename'] = filename
        inventory_dict['path_parent'] = parentdir
        inventory_dict["extension"] = extension
        inventory_dict["filesize"] = size
        inventory_dict["last_modified"] = mtime_utc
        inventory_dict["filehash"] = filehash
        inventory_dict["tag"] = ""
        inventory_dict["tag_custom"] = ""
        inventory_dict["dupe_md5"] = ""
        inventory_dict["indexing_date"] = indextime_utc
        inventory_dict["worker_name"] = workername
        inventory_dict["_type"] = "file"

        # check plugins for adding extra metadata to inventory_dict
        for plugin in diskover.plugins:
            try:
                # check if plugin is for file doc
                mappings = {'mappings': {'file': {'properties': {}}}}
                plugin.add_mappings(mappings)
                inventory_dict.update(plugin.add_meta(fullpath))
            except KeyError:
                pass

        # add any autotags to inventory_dict
        if cliargs['autotag'] and len(diskover.config['autotag_files']) > 0:
            diskover_worker_bot.auto_tag(inventory_dict, 'file', mtime_unix,
                                         None, None)

        tree_files.append(inventory_dict)

    return tree_dirs, tree_files, tree_crawltimes
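
A usage sketch for Example #4 with one hypothetical inventory row, in the column order the parser expects (bucket, key, size, last-modified, ETag, storage class, multipart flag, replication status, encryption status):

row = ['mybucket', 'photos/2018/pic.jpg', '2048',
       '2018-06-01T12:00:00.000Z', 'd41d8cd98f00b204e9800998ecf8427e',
       'STANDARD', 'false', '', 'SSE-S3']
cliargs = {'verbose': False, 'minsize': 0, 'mtime': 0, 'autotag': False}
dirs, files, crawltimes = process_line(row, [], [], [], cliargs)
# photos and photos/2018 become fake directory docs, pic.jpg a file doc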
Example #5
def process_s3_inventory(inventory_file, cliargs):
    """Process s3 inventory function.
    Takes an S3 inventory file (gzipped csv), processes and bulk adds it
    into diskover index.
    """
    jobstart = time.time()
    tree = []
    workername = diskover_worker_bot.get_worker_name()

    with gzip.open(inventory_file, mode='rt') as f:
        reader = csv.reader(f, delimiter=',', quotechar='"')
        x = 0
        for row in reader:
            if x == 0:
                # create fake root /bucketname directory entry for s3 bucket
                time_utc_now = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S")
                time_utc_epoch_start = "1970-01-01T00:00:00"
                root_dict = {}
                root_dict['filename'] = row[0]
                root_dict['path_parent'] = "/s3"
                root_dict["filesize"] = 0
                root_dict["items"] = 1  # 1 for itself
                root_dict["items_files"] = 0
                root_dict["items_subdirs"] = 0
                root_dict["last_modified"] = time_utc_epoch_start
                root_dict["tag"] = ""
                root_dict["tag_custom"] = ""
                root_dict["indexing_date"] = time_utc_now
                root_dict["worker_name"] = workername
                root_dict["change_percent_filesize"] = ""
                root_dict["change_percent_items"] = ""
                root_dict["change_percent_items_files"] = ""
                root_dict["change_percent_items_subdirs"] = ""
                tree.append(('directory', root_dict))
                tree.append(('crawltime', '/s3/' + row[0], 0))
            # count the row here; the continue statements below would
            # otherwise skip the counter and recreate the root doc entry
            x = x + 1
            starttime = time.time()
            n = 2
            # first two S3 inventory CSV columns (bucket, key) are required;
            # the remaining columns are optional and parsed positionally below
            inventory_dict = {'s3_bucket': row[0], 's3_key': row[1]}
            try:
                inventory_dict['s3_size'] = int(row[n])
                n = n + 1
            except IndexError:
                pass
            try:
                inventory_dict['s3_last_modified_date'] = row[n]
                n = n + 1
            except IndexError:
                pass
            try:
                inventory_dict['s3_etag'] = row[n]
                n = n + 1
            except IndexError:
                pass
            try:
                inventory_dict['s3_storage_class'] = row[n]
                n = n + 1
            except IndexError:
                pass
            try:
                inventory_dict['s3_multipart_upload'] = row[n]
                n = n + 1
            except IndexError:
                pass
            try:
                inventory_dict['s3_replication_status'] = row[n]
                n = n + 1
            except IndexError:
                pass
            try:
                inventory_dict['s3_encryption_status'] = row[n]
            except IndexError:
                pass

            # prepare inventory dict for diskover index

            # fake path /s3/bucketname/key
            bucket = '/s3/' + row[0] + '/'
            path = os.path.join(bucket, inventory_dict['s3_key'])
            # check if directory
            if path.endswith('/'):
                isdir = True
                path = path.rstrip('/')
            else:
                isdir = False
            size = inventory_dict['s3_size']
            # filename
            filename = os.path.basename(path)
            # check if file is in excluded_files list
            extension = os.path.splitext(filename)[1][1:].strip().lower()
            if diskover_worker_bot.file_excluded(filename, extension, path,
                                                 cliargs['verbose']):
                continue
            # Skip files smaller than minsize cli flag
            if not isdir and size < cliargs['minsize']:
                continue
            # modified time
            mtime_utc = inventory_dict['s3_last_modified_date'].partition(
                '.')[0]
            # modified time in unix
            mtime_unix = time.mktime(
                time.strptime(mtime_utc, '%Y-%m-%dT%H:%M:%S'))
            # get index time in utc
            indextime_utc = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f")
            # get absolute path of parent directory
            parentdir = os.path.abspath(os.path.join(path, os.pardir))
            # absolute full path
            fullpath = os.path.abspath(os.path.join(parentdir, filename))

            # remove any keys (fields) we don't want to add to ES
            inventory_dict.pop('s3_size', None)
            inventory_dict.pop('s3_last_modified_date', None)

            if isdir:  # directory
                inventory_dict['filename'] = filename
                inventory_dict['path_parent'] = parentdir
                inventory_dict["filesize"] = 0
                inventory_dict["items"] = 1  # 1 for itself
                inventory_dict["items_files"] = 0
                inventory_dict["items_subdirs"] = 0
                inventory_dict["last_modified"] = mtime_utc
                inventory_dict["tag"] = ""
                inventory_dict["tag_custom"] = ""
                inventory_dict["indexing_date"] = indextime_utc
                inventory_dict["worker_name"] = workername
                inventory_dict["change_percent_filesize"] = ""
                inventory_dict["change_percent_items"] = ""
                inventory_dict["change_percent_items_files"] = ""
                inventory_dict["change_percent_items_subdirs"] = ""

                # add any autotags to inventory_dict
                if cliargs['autotag'] and len(
                        diskover.config['autotag_dirs']) > 0:
                    diskover_worker_bot.auto_tag(inventory_dict, 'directory',
                                                 mtime_unix, None, None)

                # check plugins for adding extra metadata to inventory_dict
                for plugin in diskover.plugins:
                    try:
                        # check if plugin is for directory doc
                        mappings = {'mappings': {'directory': {'properties': {}}}}
                        plugin.add_mappings(mappings)
                        inventory_dict.update(plugin.add_meta(fullpath))
                    except KeyError:
                        pass

                tree.append(('directory', inventory_dict))
                tree.append(('crawltime', path, (time.time() - starttime)))

            else:  # file
                # Convert time in days (mtime cli arg) to seconds
                time_sec = cliargs['mtime'] * 86400
                file_mtime_sec = time.time() - mtime_unix
                # Only process files modified at least x days ago
                if file_mtime_sec < time_sec:
                    continue
                # create md5 hash of file using metadata filesize and mtime
                filestring = str(size) + str(mtime_unix)
                filehash = hashlib.md5(filestring.encode('utf-8')).hexdigest()

                inventory_dict['filename'] = filename
                inventory_dict['path_parent'] = parentdir
                inventory_dict["extension"] = extension
                inventory_dict["filesize"] = size
                inventory_dict["last_modified"] = mtime_utc
                inventory_dict["filehash"] = filehash
                inventory_dict["tag"] = ""
                inventory_dict["tag_custom"] = ""
                inventory_dict["dupe_md5"] = ""
                inventory_dict["indexing_date"] = indextime_utc
                inventory_dict["worker_name"] = workername

                # check plugins for adding extra metadata to inventory_dict
                for plugin in diskover.plugins:
                    try:
                        # check if plugin is for file doc
                        mappings = {'mappings': {'file': {'properties': {}}}}
                        plugin.add_mappings(mappings)
                        inventory_dict.update(plugin.add_meta(fullpath))
                    except KeyError:
                        pass

                # add any autotags to inventory_dict
                if cliargs['autotag'] and len(
                        diskover.config['autotag_files']) > 0:
                    diskover_worker_bot.auto_tag(inventory_dict, 'file',
                                                 mtime_unix, None, None)

                tree.append(('file', inventory_dict))

            if len(tree) >= diskover.config['es_chunksize']:
                diskover_worker_bot.es_bulk_adder(tree, cliargs)
                del tree[:]

    if len(tree) > 0:
        diskover_worker_bot.es_bulk_adder(tree, cliargs)
    elapsed_time = round(time.time() - jobstart, 3)
    diskover_worker_bot.bot_logger.info('*** FINISHED JOB, Elapsed Time: ' +
                                        str(elapsed_time))
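
A hypothetical driver for Example #5, looping over downloaded inventory chunks. The glob path is an assumption, and any further cliargs keys that es_bulk_adder reads (such as the target index name) would come from diskover's argument parser in the real program:

import glob

cliargs = {'verbose': False, 'minsize': 0, 'mtime': 0, 'autotag': False}
for chunk in sorted(glob.glob('/tmp/s3-inventory/*.csv.gz')):
    process_s3_inventory(chunk, cliargs)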