Example #1
def get_files(index, path):
    newpath = escape_chars(path)
    if newpath == '\\/':
        newpathwildcard = '\\/*'
    else:
        newpathwildcard = newpath + '\\/*'
    logger.info('Searching for all file docs in %s for path %s...', index,
                path)
    data = {
        '_source': [
            'path_parent', 'filename', 'last_modified', 'last_access',
            'last_change'
        ],
        'query': {
            'query_string': {
                'query':
                '(path_parent: ' + newpath + ') OR '
                '(path_parent: ' + newpathwildcard + ') OR (filename: "' +
                os.path.basename(path) + '" AND path_parent: "' +
                os.path.abspath(os.path.join(path, os.pardir)) + '")',
            }
        }
    }
    es.indices.refresh(index)
    res = es.search(index=index,
                    doc_type='file',
                    scroll='1m',
                    size=config['es_scrollsize'],
                    body=data,
                    request_timeout=config['es_timeout'])
    filelist = []
    filelist_hashed = []
    filelist_times = []
    doccount = 0
    while res['hits']['hits'] and len(res['hits']['hits']) > 0:
        for hit in res['hits']['hits']:
            fullpath = os.path.abspath(
                os.path.join(hit['_source']['path_parent'],
                             hit['_source']['filename']))
            mtime = time.mktime(
                datetime.strptime(hit['_source']['last_modified'],
                                  '%Y-%m-%dT%H:%M:%S').timetuple())
            ctime = time.mktime(
                datetime.strptime(hit['_source']['last_change'],
                                  '%Y-%m-%dT%H:%M:%S').timetuple())
            atime = time.mktime(
                datetime.strptime(hit['_source']['last_access'],
                                  '%Y-%m-%dT%H:%M:%S').timetuple())
            filelist.append(fullpath)
            filelist_hashed.append(
                hashlib.md5(fullpath.encode('utf-8')).hexdigest())
            filelist_times.append((mtime, ctime, atime))
            doccount += 1
        # use es scroll api
        res = es.scroll(scroll_id=res['_scroll_id'],
                        scroll='1m',
                        request_timeout=config['es_timeout'])
    logger.info('Found %s file docs', doccount)
    return filelist, filelist_hashed, filelist_times
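
Every example on this page calls an escape_chars helper that is not shown. A minimal sketch of what such a Lucene query_string escaper might look like (the exact character set is an assumption):

import re

def escape_chars(text):
    # Hypothetical sketch: backslash-escape characters that are special in
    # Lucene query_string syntax (including '/') so filesystem paths can be
    # embedded verbatim in the queries above.
    return re.sub(r'([+\-&|!(){}\[\]^"~*?:/\\ ])', r'\\\1', text)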
Example #2
def get_metadata(path, cliargs):
    dir_source = ""
    filename = diskover.escape_chars(os.path.basename(path))
    parent_dir = diskover.escape_chars(
        os.path.abspath(os.path.join(path, os.pardir)))
    fullpath = diskover.escape_chars(os.path.abspath(path))

    data = {
        "size": 1,
        "query": {
            "query_string": {
                "query":
                "filename: " + filename + " AND path_parent: " + parent_dir
            }
        }
    }
    res = es.search(index=cliargs['index2'],
                    doc_type='directory',
                    body=data,
                    request_timeout=diskover.config['es_timeout'])
    try:
        dir_source = res['hits']['hits'][0]['_source']
    except IndexError:
        pass

    data = {"query": {"query_string": {"query": "path_parent: " + fullpath}}}
    files_source = []
    res = es.search(index=cliargs['index2'],
                    doc_type='file',
                    scroll='1m',
                    size=1000,
                    body=data,
                    request_timeout=diskover.config['es_timeout'])

    while res['hits']['hits'] and len(res['hits']['hits']) > 0:
        for hit in res['hits']['hits']:
            files_source.append(hit['_source'])
        # get es scroll id
        scroll_id = res['_scroll_id']
        # use es scroll api
        res = es.scroll(scroll_id=scroll_id,
                        scroll='1m',
                        request_timeout=diskover.config['es_timeout'])

    return dir_source, files_source
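
A hedged usage sketch for get_metadata (the index name and path are illustrative; es and diskover are the module-level globals the function expects):

cliargs = {'index2': 'diskover-2018.01.01'}  # illustrative index name
dir_source, files_source = get_metadata('/data/projects', cliargs)
if dir_source:
    print(dir_source['filename'], 'contains', len(files_source), 'file docs')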
Example #3
def calc_dir_size(dirlist, cliargs):
    """This is the calculate directory size worker function.
    It gets a directory list from the Queue search ES for all 
    files in each directory (recursive) and sums their filesizes 
    to create a total filesize and item count for each dir, 
    then pdates dir doc's filesize and items fields.
    """
    doclist = []

    for path in dirlist:
        totalsize = 0
        totalitems = 1  # 1 for itself
        totalitems_files = 0
        totalitems_subdirs = 0
        # file doc search with aggregate for sum filesizes
        # escape special characters
        newpath = escape_chars(path[1])
        # create wildcard string and check for / (root) path
        if newpath == '\\/':
            newpathwildcard = '\\/*'
        else:
            newpathwildcard = newpath + '\\/*'

        # check if / (root) path
        if newpath == '\\/':
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        "query": "path_parent: " + newpath + "*",
                        "analyze_wildcard": "true"
                    }
                },
                "aggs": {
                    "total_size": {
                        "sum": {
                            "field": "filesize"
                        }
                    }
                }
            }
        else:
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        'query': 'path_parent: ' + newpath + ' OR path_parent: ' + newpathwildcard,
                        'analyze_wildcard': 'true'
                    }
                },
                "aggs": {
                    "total_size": {
                        "sum": {
                            "field": "filesize"
                        }
                    }
                }
            }

        # search ES and start scroll
        res = es.search(index=cliargs['index'], doc_type='file', body=data,
                        request_timeout=config['es_timeout'])

        # total items sum
        totalitems_files += res['hits']['total']

        # total file size sum
        totalsize += res['aggregations']['total_size']['value']

        # directory doc search (subdirs)

        # check if / (root) path
        if newpath == '\\/':
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        "query": "path_parent: " + newpath + "*",
                        "analyze_wildcard": "true"
                    }
                }
            }
        else:
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        'query': 'path_parent: ' + newpath + ' OR path_parent: ' + newpathwildcard,
                        'analyze_wildcard': 'true'
                    }
                }
            }

        # search ES and start scroll
        res = es.search(index=cliargs['index'], doc_type='directory', body=data,
                        request_timeout=config['es_timeout'])

        # total items sum
        totalitems_subdirs += res['hits']['total']

        # total items
        totalitems += totalitems_files + totalitems_subdirs

        # update filesize and items fields for directory (path) doc
        d = {
            '_op_type': 'update',
            '_index': cliargs['index'],
            '_type': 'directory',
            '_id': path[0],
            'doc': {'filesize': totalsize, 'items': totalitems,
                    'items_files': totalitems_files,
                    'items_subdirs': totalitems_subdirs}
        }
        # add total cost per gb to doc
        if cliargs['costpergb']:
            d = cost_per_gb(d, path[1], path[2], path[3], path[4], 'directory')
        doclist.append(d)

    index_bulk_add(es, doclist, config, cliargs)
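
index_bulk_add is not defined here; a minimal sketch of what it might do with the '_op_type': 'update' actions built above, assuming the standard elasticsearch-py bulk helper ('es_chunksize' is an assumed config key):

from elasticsearch import helpers

def index_bulk_add(es, doclist, config, cliargs):
    # Hypothetical sketch: stream the update actions to ES in chunks.
    # cliargs is accepted for signature compatibility but unused here.
    helpers.bulk(es, doclist,
                 chunk_size=config.get('es_chunksize', 500),
                 request_timeout=config['es_timeout'])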
Example #4
def calc_dir_size(dirlist, cliargs):
    """This is the calculate directory size worker function.
    It gets a directory list from the Queue and searches ES for all files
    in each directory (recursive) and sums their filesizes
    to create a total filesize and item count for each dir.
    Updates dir doc's filesize and items fields.
    """
    bot_logger = bot_log_setup(cliargs)
    jobstart = time.time()
    bot_logger.info('*** Calculating directory sizes...')

    for path in dirlist:
        totalsize = 0
        totalitems = 1  # itself
        # file doc search with aggregate for sum filesizes
        # escape special characters
        newpath = diskover.escape_chars(path[1])
        # create wildcard string and check for / (root) path
        if newpath == '\\/':
            newpathwildcard = '\\/*'
        else:
            newpathwildcard = newpath + '\\/*'

        # check if / (root) path
        if newpath == '\\/':
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        "query": "path_parent: " + newpath + "*",
                        "analyze_wildcard": "true"
                    }
                },
                "aggs": {
                    "total_size": {
                        "sum": {
                            "field": "filesize"
                        }
                    }
                }
            }
        else:
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        'query':
                        'path_parent: ' + newpath + ' '
                        'OR path_parent: ' + newpathwildcard,
                        'analyze_wildcard':
                        'true'
                    }
                },
                "aggs": {
                    "total_size": {
                        "sum": {
                            "field": "filesize"
                        }
                    }
                }
            }

        # search ES and start scroll
        res = es.search(index=cliargs['index'],
                        doc_type='file',
                        body=data,
                        request_timeout=diskover.config['es_timeout'])

        # total items sum
        totalitems += res['hits']['total']

        # total file size sum
        totalsize += res['aggregations']['total_size']['value']

        # directory doc search (subdirs)

        # search ES and start scroll
        res = es.search(index=cliargs['index'],
                        doc_type='directory',
                        body=data,
                        request_timeout=diskover.config['es_timeout'])

        # total items sum
        totalitems += res['hits']['total']

        # ES id of directory doc
        directoryid = path[0]

        # update filesize and items fields for directory (path) doc
        es.update(index=cliargs['index'],
                  id=directoryid,
                  doc_type='directory',
                  body={"doc": {
                      'filesize': totalsize,
                      'items': totalitems
                  }})

    elapsed_time = round(time.time() - jobstart, 3)
    bot_logger.info('*** FINISHED CALC DIR, Elapsed Time: ' +
                    str(elapsed_time))
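
bot_log_setup is also not shown; a minimal sketch of the kind of per-worker logger factory it might be (the logger name and 'verbose' key are assumptions):

import logging

def bot_log_setup(cliargs):
    # Hypothetical sketch: one logger per worker bot, verbose on request.
    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s')
    logger = logging.getLogger('diskover_bot')
    logger.setLevel(logging.DEBUG if cliargs.get('verbose') else logging.INFO)
    return logger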
Example #5
def get_files_gen(eshost, esver7, index, path):
    newpath = escape_chars(path)
    if newpath == '\\/':
        newpathwildcard = '\\/*'
    else:
        newpathwildcard = newpath + '\\/*'
    logger.info('Searching for all file docs in %s for path %s...', index,
                path)
    eshost.indices.refresh(index)
    if esver7:
        data = {
            '_source': [
                'path_parent', 'filename', 'filesize', 'last_modified',
                'last_access', 'last_change'
            ],
            'query': {
                'query_string': {
                    'query':
                    '((path_parent: ' + newpath + ') OR '
                    '(path_parent: ' + newpathwildcard + ') OR (filename: "' +
                    os.path.basename(path) + '" AND path_parent: "' +
                    os.path.abspath(os.path.join(path, os.pardir)) +
                    '")) AND type:file',
                }
            }
        }
        res = eshost.search(index=index,
                            scroll='1m',
                            size=config['es_scrollsize'],
                            body=data,
                            request_timeout=config['es_timeout'])
    else:
        data = {
            '_source': [
                'path_parent', 'filename', 'filesize', 'last_modified',
                'last_access', 'last_change'
            ],
            'query': {
                'query_string': {
                    'query':
                    '(path_parent: ' + newpath + ') OR '
                    '(path_parent: ' + newpathwildcard + ') OR (filename: "' +
                    os.path.basename(path) + '" AND path_parent: "' +
                    os.path.abspath(os.path.join(path, os.pardir)) + '")',
                }
            }
        }
        res = eshost.search(index=index,
                            doc_type='file',
                            scroll='1m',
                            size=config['es_scrollsize'],
                            body=data,
                            request_timeout=config['es_timeout'])

    while res['hits']['hits'] and len(res['hits']['hits']) > 0:
        for hit in res['hits']['hits']:
            fullpath = os.path.abspath(
                os.path.join(hit['_source']['path_parent'],
                             hit['_source']['filename']))
            size = hit['_source']['filesize']
            if args['rootdir2'] != args['rootdir']:
                fullpath_rep = replace_path(fullpath, args['rootdir2'],
                                            args['rootdir'])
            else:
                # avoid NameError when no path replacement is needed
                fullpath_rep = fullpath
            file_hashed = hashlib.md5(fullpath_rep.encode('utf-8')).hexdigest()
            mtime = time.mktime(
                datetime.strptime(hit['_source']['last_modified'],
                                  '%Y-%m-%dT%H:%M:%S').timetuple())
            ctime = time.mktime(
                datetime.strptime(hit['_source']['last_change'],
                                  '%Y-%m-%dT%H:%M:%S').timetuple())
            atime = time.mktime(
                datetime.strptime(hit['_source']['last_access'],
                                  '%Y-%m-%dT%H:%M:%S').timetuple())

            yield fullpath, file_hashed, size, mtime, ctime, atime

        # use es scroll api
        res = eshost.scroll(scroll_id=res['_scroll_id'],
                            scroll='1m',
                            request_timeout=config['es_timeout'])
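
A hedged usage sketch for the generator (host, index, and paths are illustrative; args is the module-level dict the original reads rootdir/rootdir2 from):

from elasticsearch import Elasticsearch

eshost = Elasticsearch(['http://localhost:9200'])  # illustrative host
args = {'rootdir': '/data', 'rootdir2': '/data'}   # assumed module globals
for fullpath, file_hashed, size, mtime, ctime, atime in get_files_gen(
        eshost, False, 'diskover-index', '/data/projects'):
    print(fullpath, size)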
Example #6
def calc_dir_size(dirlist, cliargs):
    """This is the calculate directory size worker function.
    It gets a directory list from the Queue and searches ES for all
    subdirs in each directory (recursive) and sums their filesize and
    items fields to create a total filesize and item count for each directory doc.
    Updates directory doc's filesize and items fields.
    """

    # check if other bots are idle and throw them some jobs (dir paths)
    if len(dirlist) >= cliargs['batchsize']:
        workers_idle = 0
        half_idle = False
        workers = Worker.all(connection=redis_conn)
        num_workers = len(workers)
        for w in workers:
            if w._state == "idle":
                workers_idle += 1
            if workers_idle > num_workers // 2:
                half_idle = True
                break
        q_len = len(q_calc)
        if q_len == 0 and half_idle:
            # take half the paths randomly
            shuffle(dirlist)
            n = len(dirlist) // 2
            tossdirs = dirlist[:n]
            dirlist = dirlist[n:]
            q_crawl.enqueue(calc_dir_size, args=(
                tossdirs,
                cliargs,
            ))

    doclist = []
    for path in dirlist:
        totalitems = 1  # 1 for itself
        # file doc search with aggregate for sum filesizes
        # escape special characters
        newpath = escape_chars(path[1])
        parentpath = escape_chars(
            os.path.abspath(os.path.join(path[1], os.pardir)))
        pathbasename = escape_chars(os.path.basename(path[1]))

        # create wildcard string and check for / (root) path
        if newpath == '\\/':
            newpathwildcard = '\\/*'
        else:
            newpathwildcard = newpath + '\\/*'

        # check if / (root) path
        if newpath == '\\/':
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        "query": "path_parent: " + newpath + "*",
                        "analyze_wildcard": "true"
                    }
                },
                "aggs": {
                    "total_size": {
                        "sum": {
                            "field": "filesize"
                        }
                    }
                }
            }
        else:
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        'query':
                        '(path_parent: ' + parentpath + ' AND filename: ' +
                        pathbasename + ') OR path_parent: ' + newpath +
                        ' OR path_parent: ' + newpathwildcard,
                        'analyze_wildcard':
                        'true'
                    }
                },
                "aggs": {
                    "total_size": {
                        "sum": {
                            "field": "filesize"
                        }
                    },
                    "total_files": {
                        "sum": {
                            "field": "items_files"
                        }
                    },
                    "total_subdirs": {
                        "sum": {
                            "field": "items_subdirs"
                        }
                    }
                }
            }

        # search ES and start scroll for all directory doc search (subdirs)
        res = es.search(index=cliargs['index'],
                        doc_type='directory',
                        body=data,
                        request_timeout=config['es_timeout'])

        # total file size sum
        totalsize = res['aggregations']['total_size']['value']

        # total items sum for all subdirs count
        totalitems_subdirs = res['aggregations']['total_subdirs']['value']

        # total items sum for all files count
        totalitems_files = res['aggregations']['total_files']['value']

        totalitems += totalitems_subdirs + totalitems_files

        # update filesize and items fields for directory (path) doc
        d = {
            '_op_type': 'update',
            '_index': cliargs['index'],
            '_type': 'directory',
            '_id': path[0],
            'doc': {
                'filesize': totalsize,
                'items': totalitems,
                'items_files': totalitems_files,
                'items_subdirs': totalitems_subdirs
            }
        }
        doclist.append(d)

    index_bulk_add(es, doclist, config, cliargs)
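
The rebalancing block above leans on RQ; a hedged sketch of how the redis_conn, q_calc, and q_crawl globals might be wired up (host and queue names are assumptions):

from redis import Redis
from rq import Queue

redis_conn = Redis(host='localhost', port=6379)          # illustrative host
q_calc = Queue('diskover_calc', connection=redis_conn)   # assumed queue names
q_crawl = Queue('diskover_crawl', connection=redis_conn)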
Example #7
def calc_dir_size(dirlist, cliargs):
    """This is the calculate directory size worker function.
    It gets a directory list from the Queue and searches ES for all
    subdirs in each directory (recursive) and sums their filesize and
    items fields to create a total filesize and item count for each directory doc.
    Updates directory doc's filesize and items fields.
    """

    doclist = []
    for path in dirlist:
        totalitems = 1  # 1 for itself
        # file doc search with aggregate for sum filesizes
        # escape special characters
        newpath = escape_chars(path[1])
        parentpath = escape_chars(
            os.path.abspath(os.path.join(path[1], os.pardir)))
        pathbasename = escape_chars(os.path.basename(path[1]))

        # create wildcard string and check for / (root) path
        if newpath == '\\/':
            newpathwildcard = '\\/*'
        else:
            newpathwildcard = newpath + '\\/*'

        # check if / (root) path
        if newpath == '\\/':
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        "query": "path_parent: " + newpath + "*",
                        "analyze_wildcard": "true"
                    }
                },
                "aggs": {
                    "total_size": {
                        "sum": {
                            "field": "filesize"
                        }
                    }
                }
            }
        else:
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        'query':
                        '(path_parent: ' + parentpath + ' AND filename: ' +
                        pathbasename + ') OR path_parent: ' + newpath +
                        ' OR path_parent: ' + newpathwildcard,
                        'analyze_wildcard':
                        'true'
                    }
                },
                "aggs": {
                    "total_size": {
                        "sum": {
                            "field": "filesize"
                        }
                    },
                    "total_files": {
                        "sum": {
                            "field": "items_files"
                        }
                    },
                    "total_subdirs": {
                        "sum": {
                            "field": "items_subdirs"
                        }
                    }
                }
            }

        # search ES and start scroll for all directory doc search (subdirs)
        res = es.search(index=cliargs['index'],
                        doc_type='directory',
                        body=data,
                        request_timeout=config['es_timeout'])

        # total file size sum
        totalsize = res['aggregations']['total_size']['value']

        # total items sum for all subdirs count
        totalitems_subdirs = res['aggregations']['total_subdirs']['value']

        # total items sum for all files count
        totalitems_files = res['aggregations']['total_files']['value']

        totalitems += totalitems_subdirs + totalitems_files

        # update filesize and items fields for directory (path) doc
        d = {
            '_op_type': 'update',
            '_index': cliargs['index'],
            '_type': 'directory',
            '_id': path[0],
            'doc': {
                'filesize': totalsize,
                'items': totalitems,
                'items_files': totalitems_files,
                'items_subdirs': totalitems_subdirs
            }
        }
        doclist.append(d)

    index_bulk_add(es, doclist, config, cliargs)
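
A hedged usage sketch for this worker (the doc ids and paths are made up; each tuple pairs a directory doc's ES id with its absolute path, matching the path[0]/path[1] access above):

cliargs = {'index': 'diskover-index'}      # illustrative index name
dirlist = [
    ('AWh2rK9c...', '/data/projects'),     # (es_doc_id, path) - id is made up
    ('AWh2rLf1...', '/data/archive'),
]
calc_dir_size(dirlist, cliargs)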