Example #1
def es_bulk_adder(worker_name, docs, cliargs, totalcrawltime=None):
    starttime = time.time()

    if not cliargs['s3']:
        bot_logger.info('*** Bulk adding to ES index...')

    try:
        dirlist, filelist = docs
        diskover.index_bulk_add(es, dirlist, diskover.config, cliargs)
        diskover.index_bulk_add(es, filelist, diskover.config, cliargs)
    except ValueError:
        diskover.index_bulk_add(es, docs, diskover.config, cliargs)

    if not cliargs['reindex'] and not cliargs['reindexrecurs'] and not cliargs[
            'crawlbot']:
        data = {
            "worker_name": worker_name,
            "dir_count": len(dirlist),
            "file_count": len(filelist),
            "bulk_time": round(time.time() - starttime, 6),
            "crawl_time": round(totalcrawltime, 6),
            "indexing_date": datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f")
        }
        es.index(index=cliargs['index'], doc_type='worker', body=data)

    if not cliargs['s3']:
        elapsed_time = round(time.time() - starttime, 6)
        bot_logger.info('*** FINISHED BULK ADDING, Elapsed Time: ' +
                        str(elapsed_time))
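
A note on the helper used throughout these examples: the implementation of diskover.index_bulk_add is not shown here (and its signature even differs between examples), so the following is only a rough sketch, assuming it hands a flat list of documents to elasticsearch-py's bulk() helper. The function name and behaviour below are assumptions, not diskover's actual code.

from elasticsearch.helpers import bulk

def index_bulk_add_sketch(es, docs, index, doc_type):
    # Turn each plain document dict into one index action for the target index.
    actions = ({'_index': index, '_type': doc_type, '_source': doc} for doc in docs)
    # bulk() streams the actions and returns (number_indexed, errors).
    return bulk(es, actions)
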
Example #2
def index_dupes(hashgroup, cliargs):
    """This is the ES dupe_md5 tag update function.
    It updates a file's dupe_md5 field to the file's md5sum
    if it's marked as a duplicate.
    """
    bot_logger = diskover_worker_bot.bot_logger
    # create Elasticsearch connection
    es = diskover.elasticsearch_connect(diskover.config)
    file_id_list = []
    # bulk update data in Elasticsearch index
    for f in hashgroup['files']:
        d = {
            '_op_type': 'update',
            '_index': cliargs['index'],
            '_type': 'file',
            '_id': f['id'],
            'doc': {'dupe_md5': hashgroup['md5sum']}
        }
        file_id_list.append(d)
    if len(file_id_list) > 0:
        if cliargs['verbose']:
            bot_logger.info('Bulk updating %s files in ES index' % len(file_id_list))
        diskover.index_bulk_add(es, file_id_list, 'file', diskover.config, cliargs)
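
index_dupes() only reads two things from its hashgroup argument: the group's md5sum and each file's Elasticsearch document id. A hypothetical input illustrating that shape (hash and ids are made up):

hashgroup = {
    'md5sum': 'd41d8cd98f00b204e9800998ecf8427e',   # md5 shared by every file in the group
    'files': [
        {'id': 'dupe-doc-id-1'},   # _id of a file doc in the diskover index
        {'id': 'dupe-doc-id-2'},
    ],
}
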
Example #3
def es_bulk_adder(result, cliargs, bot_logger):
    worker_name = get_worker_name()
    starttime = time.time()
    dirlist = []
    filelist = []
    crawltimelist = []
    totalcrawltime = 0

    for item in result:
        if item[0] == 'directory':
            dirlist.append(item[1])
        elif item[0] == 'file':
            filelist.append(item[1])
        elif item[0] == 'crawltime':
            crawltimelist.append(item)
            totalcrawltime += item[2]

    bot_logger.info('*** Bulk adding to ES index...')
    diskover.index_bulk_add(es, dirlist, 'directory', diskover.config, cliargs)
    diskover.index_bulk_add(es, filelist, 'file', diskover.config, cliargs)
    if not cliargs['reindex'] and not cliargs['reindexrecurs'] and not cliargs[
            'crawlbot']:
        diskover.add_crawl_stats_bulk(es, crawltimelist, worker_name,
                                      diskover.config, cliargs)
        data = {
            "worker_name": worker_name,
            "dir_count": len(dirlist),
            "file_count": len(filelist),
            "bulk_time": round(time.time() - starttime, 10),
            "crawl_time": round(totalcrawltime, 10),
            "indexing_date": datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f")
        }
        es.index(index=cliargs['index'], doc_type='worker', body=data)
    elapsed_time = round(time.time() - starttime, 3)
    bot_logger.info('*** FINISHED BULK ADDING, Elapsed Time: ' +
                    str(elapsed_time))
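
The result iterable mixes three tuple shapes, which can be read off the branches above. A purely illustrative payload (field values invented):

result = [
    ('directory', {'filename': 'docs', 'path_parent': '/data'}),
    ('file', {'filename': 'report.pdf', 'path_parent': '/data/docs', 'filesize': 4096}),
    ('crawltime', '/data/docs', 0.042),   # item[2] is seconds, summed into totalcrawltime
]
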
Example #4
def calc_dir_size(dirlist, cliargs):
    """This is the calculate directory size worker function.
    It gets a directory list from the Queue, searches ES for all
    files in each directory (recursive) and sums their filesizes
    to create a total filesize and item count for each dir,
    then updates the dir doc's filesize and items fields.
    """
    doclist = []

    for path in dirlist:
        totalsize = 0
        totalitems = 1  # 1 for itself
        totalitems_files = 0
        totalitems_subdirs = 0
        # file doc search with aggregate for sum filesizes
        # escape special characters
        newpath = escape_chars(path[1])
        # create wildcard string and check for / (root) path
        if newpath == '\/':
            newpathwildcard = '\/*'
        else:
            newpathwildcard = newpath + '\/*'

        # check if / (root) path
        if newpath == '\/':
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        "query": "path_parent: " + newpath + "*",
                        "analyze_wildcard": "true"
                    }
                },
                "aggs": {
                    "total_size": {
                        "sum": {
                            "field": "filesize"
                        }
                    }
                }
            }
        else:
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        'query': 'path_parent: ' + newpath + ' OR path_parent: ' + newpathwildcard,
                        'analyze_wildcard': 'true'
                    }
                },
                "aggs": {
                    "total_size": {
                        "sum": {
                            "field": "filesize"
                        }
                    }
                }
            }

        # search ES and start scroll
        res = es.search(index=cliargs['index'], doc_type='file', body=data,
                        request_timeout=config['es_timeout'])

        # total items sum
        totalitems_files += res['hits']['total']

        # total file size sum
        totalsize += res['aggregations']['total_size']['value']

        # directory doc search (subdirs)

        # check if / (root) path
        if newpath == '\/':
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        "query": "path_parent: " + newpath + "*",
                        "analyze_wildcard": "true"
                    }
                }
            }
        else:
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        'query': 'path_parent: ' + newpath + ' OR path_parent: ' + newpathwildcard,
                        'analyze_wildcard': 'true'
                    }
                }
            }

        # search ES and start scroll
        res = es.search(index=cliargs['index'], doc_type='directory', body=data,
                        request_timeout=config['es_timeout'])

        # total items sum
        totalitems_subdirs += res['hits']['total']

        # total items
        totalitems += totalitems_files + totalitems_subdirs

        # update filesize and items fields for directory (path) doc
        d = {
            '_op_type': 'update',
            '_index': cliargs['index'],
            '_type': 'directory',
            '_id': path[0],
            'doc': {'filesize': totalsize, 'items': totalitems,
                    'items_files': totalitems_files,
                    'items_subdirs': totalitems_subdirs}
        }
        # add total cost per gb to doc
        if cliargs['costpergb']:
            d = cost_per_gb(d, path[1], path[2], path[3], path[4], 'directory')
        doclist.append(d)

    index_bulk_add(es, doclist, config, cliargs)
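
escape_chars() is a diskover helper whose implementation is not shown in these examples; the root-path check above implies it backslash-escapes '/' and, presumably, the other Lucene query_string special characters. A hypothetical stand-in, handy for seeing what the assembled queries look like:

import re

def escape_chars_sketch(text):
    # Simplified stand-in: backslash-escape characters that Lucene's
    # query_string syntax treats as special (the real helper may differ).
    return re.sub(r'([+\-&|!(){}\[\]^"~*?:\\/ ])', r'\\\1', text)

# escape_chars_sketch('/data/projects') returns the text \/data\/projects, so the
# non-root branch above would query:
#   "path_parent: \/data\/projects OR path_parent: \/data\/projects\/*"
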
Example #5
def calc_hot_dirs(dirlist, cliargs):
    """This is the calculate hotdirs worker function.
    It gets a directory list from the Queue, iterates over the paths,
    searches index2 for each path, and calculates the change percent
    between the two. If a path is not in index2, the change percent is 100%.
    Updates the index's directory doc change_percent fields.
    """
    doclist = []

    for path in dirlist:
        # doc search (matching path) in index2
        # filename
        f = os.path.basename(path[1])
        # parent path
        p = os.path.abspath(os.path.join(path[1], os.pardir))

        data = {
            "size": 1,
            "_source": ['filesize', 'items', 'items_files', 'items_subdirs'],
            "query": {
                "query_string": {
                    "query": "filename: \"" + f + "\" AND path_parent: \"" + p + "\""
                }
            }
        }

        # search ES
        res = es.search(index=cliargs['hotdirs'], doc_type='directory', body=data,
                        request_timeout=config['es_timeout'])

        # calculate change percent

        # set change percent to 100% if no matching path in index2
        if len(res['hits']['hits']) == 0:
            changepercent_filesize = 100.0
            changepercent_items = 100.0
            changepercent_items_files = 100.0
            changepercent_items_subdirs = 100.0
        else:
            source = res['hits']['hits'][0]['_source']
            # ((new - old) / old) * 100
            try:
                # check if path size in index2 was 0 bytes and set change percent to 100%
                if path[2] > 0 and source['filesize'] == 0:
                    changepercent_filesize = 100.0
                else:
                    changepercent_filesize = round(((path[2] - source['filesize'])
                                                    / source['filesize']) * 100.0, 2)
            except ZeroDivisionError:
                changepercent_filesize = 0.0
            try:
                # check if path items in index2 was 0 and set change percent to 100%
                if path[3] > 0 and source['items'] == 0:
                    changepercent_items = 100.0
                else:
                    changepercent_items = round(((path[3] - source['items'])
                                                 / source['items']) * 100.0, 2)
            except ZeroDivisionError:
                changepercent_items = 0.0
            try:
                # check if path file items in index2 was 0 and set change percent to 100%
                if path[4] > 0 and source['items_files'] == 0:
                    changepercent_items_files = 100.0
                else:
                    changepercent_items_files = round(((path[4] - source['items_files'])
                                                       / source['items_files']) * 100.0, 2)
            except ZeroDivisionError:
                changepercent_items_files = 0.0
            try:
                # check if path subdir items in index2 was 0 and set change percent to 100%
                if path[5] > 0 and source['items_subdirs'] == 0:
                    changepercent_items_subdirs = 100.0
                else:
                    changepercent_items_subdirs = round(((path[5] - source['items_subdirs'])
                                                         / source['items_subdirs']) * 100.0, 2)
            except ZeroDivisionError:
                changepercent_items_subdirs = 0.0

        # update fields in index
        d = {
            '_op_type': 'update',
            '_index': cliargs['index'],
            '_type': 'directory',
            '_id': path[0],
            'doc': {'change_percent_filesize': changepercent_filesize,
                    'change_percent_items': changepercent_items,
                    'change_percent_items_files': changepercent_items_files,
                    'change_percent_items_subdirs': changepercent_items_subdirs}
        }
        doclist.append(d)

    index_bulk_add(es, doclist, config, cliargs)
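
All four fields above follow the same rule: ((new - old) / old) * 100, forced to 100% when the old value was zero but the new one is not, and 0% when both are zero. A small helper restating that logic (not part of diskover), with worked values:

def change_percent_sketch(new, old):
    # 100% growth if the old value was zero and there is now something there.
    if old == 0:
        return 100.0 if new > 0 else 0.0
    return round(((new - old) / old) * 100.0, 2)

# change_percent_sketch(150, 100) -> 50.0
# change_percent_sketch(10, 0)    -> 100.0
# change_percent_sketch(0, 0)     -> 0.0
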
Example #6
def tag_copier(path, cliargs):
    """This is the tag copier worker function.
    It gets a path from the Queue, searches the index for the
    same path, and copies any existing tags (from index2).
    Updates the index doc's tag and tag_custom fields.
    """
    bot_logger = bot_log_setup(cliargs)
    jobstart = time.time()

    dir_id_list = []
    file_id_list = []

    # doc search (matching path) in index for existing tags from index2
    # filename
    f = os.path.basename(path[0])
    # parent path
    p = os.path.abspath(os.path.join(path[0], os.pardir))

    data = {
        "size": 1,
        "_source": ['tag', 'tag_custom'],
        "query": {
            "query_string": {
                "query":
                "filename: \"" + f + "\" AND path_parent: \"" + p + "\""
            }
        }
    }

    # check if file or directory
    if path[3] == 'directory':
        # search ES
        res = es.search(index=cliargs['index'],
                        doc_type='directory',
                        body=data,
                        request_timeout=diskover.config['es_timeout'])
    else:
        res = es.search(index=cliargs['index'],
                        doc_type='file',
                        body=data,
                        request_timeout=diskover.config['es_timeout'])

    # mark task done if no matching path in index and continue
    if len(res['hits']['hits']) == 0:
        bot_logger.info('*** No matching path found in index')
        return True

    # existing tag in index2
    docid = res['hits']['hits'][0]['_id']

    # update tag and tag_custom fields in index
    d = {
        '_op_type': 'update',
        '_index': cliargs['index'],
        '_type': path[3],
        '_id': docid,
        'doc': {
            'tag': path[1],
            'tag_custom': path[2]
        }
    }
    if path[3] == 'directory':
        dir_id_list.append(d)
    else:
        file_id_list.append(d)

    diskover.index_bulk_add(es, dir_id_list, 'directory', diskover.config,
                            cliargs)
    diskover.index_bulk_add(es, file_id_list, 'file', diskover.config, cliargs)

    elapsed_time = round(time.time() - jobstart, 3)
    bot_logger.info('*** FINISHED JOB, Elapsed Time: ' + str(elapsed_time))
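
The shape of the path argument can be inferred from how its elements are used above; the values here are illustrative only:

path = ('/data/docs/report.pdf',  # path[0]: full path looked up in the index
        'archive',                # path[1]: existing tag carried over from index2
        'my tag',                 # path[2]: existing tag_custom carried over
        'file')                   # path[3]: doc type, 'file' or 'directory'
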
Example #7
def calc_dir_size(dirlist, cliargs):
    """This is the calculate directory size worker function.
    It gets a directory list from the Queue and searches ES for all files
    in each directory (recursive) and sums their filesizes
    to create a total filesize and item count for each dir.
    Updates dir doc's filesize and items fields.
    """
    jobstart = time.time()
    bot_logger.info('*** Calculating directory sizes...')

    doclist = []
    for path in dirlist:
        totalsize = 0
        totalitems = 1  # 1 for itself
        totalitems_files = 0
        totalitems_subdirs = 0
        # file doc search with aggregate for sum filesizes
        # escape special characters
        newpath = diskover.escape_chars(path[1])
        # create wildcard string and check for / (root) path
        if newpath == '\/':
            newpathwildcard = '\/*'
        else:
            newpathwildcard = newpath + '\/*'

        # check if / (root) path
        if newpath == '\/':
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        "query": "path_parent: " + newpath + "*",
                        "analyze_wildcard": "true"
                    }
                },
                "aggs": {
                    "total_size": {
                        "sum": {
                            "field": "filesize"
                        }
                    }
                }
            }
        else:
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        'query': 'path_parent: ' + newpath + ' OR path_parent: ' + newpathwildcard,
                        'analyze_wildcard': 'true'
                    }
                },
                "aggs": {
                    "total_size": {
                        "sum": {
                            "field": "filesize"
                        }
                    }
                }
            }

        # search ES and start scroll
        res = es.search(index=cliargs['index'], doc_type='file', body=data,
                        request_timeout=diskover.config['es_timeout'])

        # total items sum
        totalitems_files += res['hits']['total']

        # total file size sum
        totalsize += res['aggregations']['total_size']['value']

        # directory doc search (subdirs)

        # search ES and start scroll
        res = es.search(index=cliargs['index'], doc_type='directory', body=data,
                        request_timeout=diskover.config['es_timeout'])

        # total items sum
        totalitems_subdirs += res['hits']['total']

        # total items
        totalitems += totalitems_files + totalitems_subdirs

        # update filesize and items fields for directory (path) doc
        d = {
            '_op_type': 'update',
            '_index': cliargs['index'],
            '_type': 'directory',
            '_id': path[0],
            'doc': {'filesize': totalsize, 'items': totalitems,
                    'items_files': totalitems_files,
                    'items_subdirs': totalitems_subdirs}
        }
        doclist.append(d)

    diskover.index_bulk_add(es, doclist, diskover.config, cliargs)

    elapsed_time = round(time.time() - jobstart, 3)
    bot_logger.info('*** FINISHED CALC DIR, Elapsed Time: ' + str(elapsed_time))
Example #8
def es_bulk_add(worker_name, dirlist, filelist, cliargs, totalcrawltime=None):
    if cliargs['chunkfiles']:
        updated_dirlist = []
        # check for existing directory docs in index and update crawl time only (dirchunk)
        for d in dirlist:
            try:
                path = d['chunkpath']  # this key determines if it's part of a chunked dir
                crawltime = d['crawl_time']
                f = os.path.basename(path)
                # parent path
                p = os.path.abspath(os.path.join(path, os.pardir))

                data = {
                    "size": 1,
                    "_source": ['crawl_time'],
                    "query": {
                        "query_string": {
                            "query":
                            "filename: \"" + f + "\" AND path_parent: \"" + p +
                            "\""
                        }
                    }
                }

                es.indices.refresh(index=cliargs['index'])
                res = es.search(index=cliargs['index'],
                                doc_type='directory',
                                body=data,
                                request_timeout=config['es_timeout'])

                if len(res['hits']['hits']) == 0:
                    continue

                docid = res['hits']['hits'][0]['_id']
                current_crawltime = res['hits']['hits'][0]['_source'][
                    'crawl_time']
                updated_crawltime = current_crawltime + crawltime

                # update crawltime in index
                d = {
                    '_op_type': 'update',
                    '_index': cliargs['index'],
                    '_type': 'directory',
                    '_id': docid,
                    'doc': {
                        'crawl_time': updated_crawltime
                    }
                }
            except KeyError:
                pass  # not part of a chunked dir

            updated_dirlist.append(d)

        dirlist = updated_dirlist

    starttime = time.time()

    docs = dirlist + filelist
    index_bulk_add(es, docs, config, cliargs)

    if not cliargs['noworkerdocs']:
        data = {
            "worker_name": worker_name,
            "dir_count": len(dirlist),
            "file_count": len(filelist),
            "bulk_time": round(time.time() - starttime, 6),
            "crawl_time": round(totalcrawltime, 6),
            "indexing_date": datetime.utcnow().isoformat()
        }
        es.index(index=cliargs['index'], doc_type='worker', body=data)
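
The chunked-directory handling above hinges on a single marker key. A hypothetical dirlist entry showing only the keys that code reads (a normal directory doc would carry the usual filename/path_parent fields instead):

chunked_dir_entry = {
    'chunkpath': '/data/bigdir',   # presence of this key marks a chunked directory
    'crawl_time': 0.87,            # seconds spent crawling this chunk, added to the doc's total
}
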
Example #9
def calc_dir_size(dirlist, cliargs):
    """This is the calculate directory size worker function.
    It gets a directory list from the Queue and searches ES for all
    subdirs in each directory (recursive) and sums their filesize and
    items fields to create a total filesize and item count for each directory doc.
    Updates directory doc's filesize and items fields.
    """

    # check if other bots are idle and throw them some jobs (dir paths)
    if len(dirlist) >= cliargs['batchsize']:
        workers_idle = 0
        workers = Worker.all(connection=redis_conn)
        num_workers = len(workers)
        for w in workers:
            if w._state == "idle":
                workers_idle += 1
            if workers_idle > num_workers // 2:
                workers_idle = True
                break
        q_len = len(q_calc)
        if q_len == 0 and workers_idle == True:
            # take half the paths randomly
            shuffle(dirlist)
            n = len(dirlist) // 2
            tossdirs = dirlist[:n]
            dirlist = dirlist[n:]
            q_crawl.enqueue(calc_dir_size, args=(
                tossdirs,
                cliargs,
            ))

    doclist = []
    for path in dirlist:
        totalitems = 1  # 1 for itself
        # file doc search with aggregate for sum filesizes
        # escape special characters
        newpath = escape_chars(path[1])
        parentpath = escape_chars(
            os.path.abspath(os.path.join(path[1], os.pardir)))
        pathbasename = escape_chars(os.path.basename(path[1]))

        # create wildcard string and check for / (root) path
        if newpath == '\/':
            newpathwildcard = '\/*'
        else:
            newpathwildcard = newpath + '\/*'

        # check if / (root) path
        if newpath == '\/':
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        "query": "path_parent: " + newpath + "*",
                        "analyze_wildcard": "true"
                    }
                },
                "aggs": {
                    "total_size": {
                        "sum": {
                            "field": "filesize"
                        }
                    }
                }
            }
        else:
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        'query':
                        '(path_parent: ' + parentpath + ' AND filename: ' +
                        pathbasename + ') OR path_parent: ' + newpath +
                        ' OR path_parent: ' + newpathwildcard,
                        'analyze_wildcard':
                        'true'
                    }
                },
                "aggs": {
                    "total_size": {
                        "sum": {
                            "field": "filesize"
                        }
                    },
                    "total_files": {
                        "sum": {
                            "field": "items_files"
                        }
                    },
                    "total_subdirs": {
                        "sum": {
                            "field": "items_subdirs"
                        }
                    }
                }
            }

        # search ES and start scroll for all directory doc search (subdirs)
        res = es.search(index=cliargs['index'],
                        doc_type='directory',
                        body=data,
                        request_timeout=config['es_timeout'])

        # total file size sum
        totalsize = res['aggregations']['total_size']['value']

        # total items sum for all subdirs count
        totalitems_subdirs = res['aggregations']['total_subdirs']['value']

        # total items sum for all files count
        totalitems_files = res['aggregations']['total_files']['value']

        totalitems += totalitems_subdirs + totalitems_files

        # update filesize and items fields for directory (path) doc
        d = {
            '_op_type': 'update',
            '_index': cliargs['index'],
            '_type': 'directory',
            '_id': path[0],
            'doc': {
                'filesize': totalsize,
                'items': totalitems,
                'items_files': totalitems_files,
                'items_subdirs': totalitems_subdirs
            }
        }
        doclist.append(d)

    index_bulk_add(es, doclist, config, cliargs)
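
The opening block hands half of an oversized batch back to the queue when most workers are idle, using plain rq primitives. A minimal, self-contained sketch of the same idea; the queue name, Redis location and helper name are assumptions (diskover wires up its own q_crawl and redis_conn):

from random import shuffle
from redis import Redis
from rq import Queue, Worker

redis_conn = Redis()                              # assumed local Redis
q_crawl = Queue('crawl', connection=redis_conn)   # assumed queue name

def maybe_offload(dirlist, worker_func, cliargs, batchsize):
    if len(dirlist) < batchsize:
        return dirlist
    workers = Worker.all(connection=redis_conn)
    idle = sum(1 for w in workers if w.get_state() == 'idle')
    # re-enqueue half the paths when most workers are idle and the queue is empty
    if workers and idle > len(workers) // 2 and len(q_crawl) == 0:
        shuffle(dirlist)
        half = len(dirlist) // 2
        q_crawl.enqueue(worker_func, args=(dirlist[:half], cliargs))
        dirlist = dirlist[half:]
    return dirlist
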
Example #10
def calc_dir_size(dirlist, cliargs):
    """This is the calculate directory size worker function.
    It gets a directory list from the Queue and searches ES for all
    subdirs in each directory (recursive) and sums their filesize and
    items fields to create a total filesize and item count for each directory doc.
    Updates directory doc's filesize and items fields.
    """

    doclist = []
    for path in dirlist:
        totalitems = 1  # 1 for itself
        # file doc search with aggregate for sum filesizes
        # escape special characters
        newpath = escape_chars(path[1])
        parentpath = escape_chars(
            os.path.abspath(os.path.join(path[1], os.pardir)))
        pathbasename = escape_chars(os.path.basename(path[1]))

        # create wildcard string and check for / (root) path
        if newpath == '\/':
            newpathwildcard = '\/*'
        else:
            newpathwildcard = newpath + '\/*'

        # check if / (root) path
        if newpath == '\/':
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        "query": "path_parent: " + newpath + "*",
                        "analyze_wildcard": "true"
                    }
                },
                "aggs": {
                    "total_size": {
                        "sum": {
                            "field": "filesize"
                        }
                    }
                }
            }
        else:
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        'query':
                        '(path_parent: ' + parentpath + ' AND filename: ' +
                        pathbasename + ') OR path_parent: ' + newpath +
                        ' OR path_parent: ' + newpathwildcard,
                        'analyze_wildcard':
                        'true'
                    }
                },
                "aggs": {
                    "total_size": {
                        "sum": {
                            "field": "filesize"
                        }
                    },
                    "total_files": {
                        "sum": {
                            "field": "items_files"
                        }
                    },
                    "total_subdirs": {
                        "sum": {
                            "field": "items_subdirs"
                        }
                    }
                }
            }

        # search ES and start scroll for all directory doc search (subdirs)
        res = es.search(index=cliargs['index'],
                        doc_type='directory',
                        body=data,
                        request_timeout=config['es_timeout'])

        # total file size sum
        totalsize = res['aggregations']['total_size']['value']

        # total items sum for all subdirs count
        totalitems_subdirs = res['aggregations']['total_subdirs']['value']

        # total items sum for all files count
        totalitems_files = res['aggregations']['total_files']['value']

        totalitems += totalitems_subdirs + totalitems_files

        # update filesize and items fields for directory (path) doc
        d = {
            '_op_type': 'update',
            '_index': cliargs['index'],
            '_type': 'directory',
            '_id': path[0],
            'doc': {
                'filesize': totalsize,
                'items': totalitems,
                'items_files': totalitems_files,
                'items_subdirs': totalitems_subdirs
            }
        }
        doclist.append(d)

    index_bulk_add(es, doclist, config, cliargs)
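
For a concrete path, the non-root branch assembles a three-part query: the directory's own doc (matched by parent path plus basename), its direct children, and everything deeper. A worked example, assuming escape_chars escapes '/' as in the sketch after Example #4 (values illustrative):

# For path[1] == '/data/projects':
parentpath = '\\/data'
pathbasename = 'projects'
newpath = '\\/data\\/projects'
newpathwildcard = '\\/data\\/projects\\/*'
query = ('(path_parent: ' + parentpath + ' AND filename: ' + pathbasename +
         ') OR path_parent: ' + newpath + ' OR path_parent: ' + newpathwildcard)
# The three sum aggregations then roll filesize, items_files and items_subdirs
# from every matched directory doc up into one total for the parent.
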