def socket_thread_handler_twc(threadnum, q, q_kill, lock, rootdir, num_sep, level,
                              batchsize, cliargs, logger, reindex_dict):
    """This is the socket thread handler tree walk client function.
    Stream of directory listings (pickle) from diskover treewalk
    client connections are enqueued to redis rq queue.
    """
    while True:
        try:
            c = q.get()
            clientsock, addr = c
            logger.debug(clientsock)
            logger.debug(addr)
            totalfiles = 0
            while True:
                data = recv_one_message(clientsock)
                if not data:
                    break
                if data == b'SIGKILL' or data == 'SIGKILL':
                    q_kill.put(b'SIGKILL')
                    break
                # unpickle data sent from client
                data_decoded = pickle.loads(data)
                logger.debug(data_decoded)
                # enqueue to redis
                batch = []
                for root, dirs, files in data_decoded:
                    files_len = len(files)
                    totalfiles += files_len
                    # check for empty dirs
                    if len(dirs) == 0 and len(files) == 0 and not cliargs['indexemptydirs']:
                        continue
                    batch.append((root, dirs, files))
                    batch_len = len(batch)
                    if batch_len >= batchsize or (
                            cliargs['adaptivebatch'] and
                            totalfiles >= config['adaptivebatch_maxfiles']):
                        q_crawl.enqueue(scrape_tree_meta,
                                        args=(batch, cliargs, reindex_dict,),
                                        result_ttl=config['redis_ttl'])
                        if cliargs['debug'] or cliargs['verbose']:
                            logger.info("enqueued batchsize: %s (batchsize: %s)"
                                        % (batch_len, batchsize))
                        del batch[:]
                        totalfiles = 0
                        if cliargs['adaptivebatch']:
                            batchsize = adaptive_batch(q_crawl, cliargs, batchsize)
                            if cliargs['debug'] or cliargs['verbose']:
                                logger.info("batchsize set to: %s" % batchsize)
                if len(batch) > 0:
                    # add any remaining in batch to queue
                    q_crawl.enqueue(scrape_tree_meta,
                                    args=(batch, cliargs, reindex_dict,),
                                    result_ttl=config['redis_ttl'])
                    del batch[:]
            # close connection to client
            clientsock.close()
            logger.info("[thread-%s]: %s closed connection" % (threadnum, str(addr)))
            q.task_done()
        except socket.error as e:
            logger.error("[thread-%s]: Socket error (%s)" % (threadnum, e))
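# The handler above relies on recv_one_message() to pull one complete pickled
# payload off the client socket before calling pickle.loads(). A minimal sketch
# of such a helper is shown below, assuming a simple length-prefixed framing
# (4-byte big-endian size header followed by the payload); the actual wire
# format used by the diskover treewalk client may differ.
import struct


def recv_one_message_sketch(sock):
    """Receive one length-prefixed message from sock, or None on EOF."""
    header = recvall_sketch(sock, 4)
    if not header:
        return None
    (length,) = struct.unpack('!I', header)
    return recvall_sketch(sock, length)


def recvall_sketch(sock, count):
    """Read exactly count bytes from sock, returning None if the peer closes early."""
    buf = b''
    while count:
        chunk = sock.recv(count)
        if not chunk:
            return None
        buf += chunk
        count -= len(chunk)
    return buf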
def scrape_tree_meta(paths, cliargs, reindex_dict):
    try:
        global worker
        tree_dirs = []
        tree_files = []
        totalcrawltime = 0
        statsembeded = False
        num_workers = len(SimpleWorker.all(connection=redis_conn))
        path_count = 0
        filenames = []
        for path in paths:
            path_count += 1
            starttime = time.time()
            if not cliargs['dirsonly']:
                root, dirs, files = path
            else:
                root, dirs = path
                files = []
            if path_count == 1:
                if type(root) is tuple:
                    statsembeded = True
            # check if stats embedded in data from diskover tree walk client or crawlapi
            if statsembeded:
                root_path = root[0]
                dmeta = get_dir_meta(worker, root, cliargs, reindex_dict, statsembeded=True)
            else:
                root_path = root
                dmeta = get_dir_meta(worker, root_path, cliargs, reindex_dict, statsembeded=False)
            if dmeta:
                # no files in batch, get them with scandir
                if cliargs['dirsonly']:
                    for entry in scandir(root):
                        if entry.is_file(follow_symlinks=False) and not file_excluded(entry.name):
                            files.append(entry.name)
                filecount = 0
                # check if the directory has a ton of files in it and
                # farm out meta collection to other worker bots
                files_count = len(files)
                if cliargs['splitfiles'] and files_count >= cliargs['splitfilesnum']:
                    fmetas = []
                    for filelist in split_list(files, int(files_count / num_workers)):
                        fmetas.append(
                            q_crawl.enqueue(file_meta_collector,
                                            args=(filelist, root_path, statsembeded,
                                                  cliargs, reindex_dict,),
                                            result_ttl=config['redis_ttl']))
                    n = 0
                    while n < len(fmetas):
                        # poll until each enqueued job's result is ready
                        if fmetas[n].result:
                            for fmeta in fmetas[n].result:
                                if fmeta:
                                    tree_files.append(fmeta)
                                    filecount += 1
                            n += 1
                    del fmetas[:]
                else:
                    for file in files:
                        filenames.append(file[0])
                        if statsembeded:
                            fmeta = get_file_meta(worker, file, cliargs, reindex_dict,
                                                  statsembeded=True)
                        else:
                            fmeta = get_file_meta(worker, os.path.join(root_path, file),
                                                  cliargs, reindex_dict, statsembeded=False)
                        if fmeta:
                            tree_files.append(fmeta)
                            filecount += 1
                # update crawl time
                elapsed = time.time() - starttime
                dmeta['crawl_time'] = round(elapsed, 6)
                # check for empty dirs and dirsonly cli arg
                if cliargs['indexemptydirs']:
                    tree_dirs.append(dmeta)
                elif not cliargs['indexemptydirs'] and (len(dirs) > 0 or filecount > 0):
                    tree_dirs.append(dmeta)
                totalcrawltime += elapsed
                # check if doc count is more than es chunksize and bulk add to es
                if len(tree_dirs) + len(tree_files) >= config['es_chunksize']:
                    es_bulk_add(worker, tree_dirs, tree_files, cliargs, totalcrawltime)
                    del tree_dirs[:]
                    del tree_files[:]
                    totalcrawltime = 0
        # bulk add to es
        if len(tree_dirs) > 0 or len(tree_files) > 0:
            es_bulk_add(worker, tree_dirs, tree_files, cliargs, totalcrawltime)
        print('%s | processed %d files' % (datetime.now(), len(filenames)))
        return True, filenames
    except Exception as e:
        print('%s | error | %s' % (datetime.now(), e))
        return False, []
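# The splitfiles branch above uses split_list() to break a large file list into
# roughly worker-sized chunks before enqueuing them. A hedged sketch of a
# chunking helper with that calling convention is shown below (hypothetical
# implementation; the real helper may behave differently, e.g. around a zero
# chunk size when there are more workers than files).
def split_list_sketch(items, chunk_size):
    """Yield successive chunk_size-sized slices of items."""
    chunk_size = max(1, int(chunk_size))  # guard against a zero/negative step
    for i in range(0, len(items), chunk_size):
        yield items[i:i + chunk_size]


# Example: split ten filenames across chunks of three.
# list(split_list_sketch(['f%d' % i for i in range(10)], 3))
# -> [['f0', 'f1', 'f2'], ['f3', 'f4', 'f5'], ['f6', 'f7', 'f8'], ['f9']]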
def socket_thread_handler_twc(threadnum, q, q_kill, rootdir, num_sep, level,
                              batchsize, cliargs, logger, reindex_dict):
    """This is the socket thread handler tree walk client function.
    Stream of directory listings (pickle) from diskover treewalk
    client connections are enqueued to redis rq queue.
    """
    while True:
        try:
            c = q.get()
            clientsock, addr = c
            logger.debug(clientsock)
            logger.debug(addr)
            while True:
                data = recv_one_message(clientsock)
                #logger.debug(data)
                if not data:
                    break
                if data == b'SIGKILL' or data == 'SIGKILL':
                    q_kill.put(b'SIGKILL')
                    break
                data_decoded = pickle.loads(data)
                logger.debug(data_decoded)
                # enqueue to redis
                batch = []
                for root, dirs, files in data_decoded:
                    if len(dirs) == 0 and len(files) == 0 and not cliargs['indexemptydirs']:
                        continue
                    # check if meta stat data has been embedded in the data from client
                    if type(root) is tuple:
                        rootpath = root[0]
                    else:
                        rootpath = root
                    if not dir_excluded(rootpath, config, cliargs['verbose']):
                        batch.append((root, dirs, files))
                        batch_len = len(batch)
                        if batch_len >= batchsize:
                            q_crawl.enqueue(scrape_tree_meta,
                                            args=(batch, cliargs, reindex_dict,))
                            del batch[:]
                            if cliargs['adaptivebatch']:
                                batchsize = adaptive_batch(q_crawl, cliargs, batchsize)
                        # check if at maxdepth level and delete dirs/files lists to not
                        # descend further down the tree
                        num_sep_this = rootpath.count(os.path.sep)
                        if num_sep + level <= num_sep_this:
                            del dirs[:]
                            del files[:]
                    else:
                        # directory excluded
                        del dirs[:]
                        del files[:]
                if len(batch) > 0:
                    # add any remaining in batch to queue
                    q_crawl.enqueue(scrape_tree_meta,
                                    args=(batch, cliargs, reindex_dict,))
                    del batch[:]
            # close connection to client
            clientsock.close()
            logger.info("[thread-%s]: %s closed connection" % (threadnum, str(addr)))
            q.task_done()
        except socket.error as e:
            logger.error("[thread-%s]: Socket error (%s)" % (threadnum, e))
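# Both socket handlers call adaptive_batch() to retune the enqueue batch size
# while the crawl queue grows or drains. A hedged sketch of one possible policy
# is shown below: shrink batches when the queue is empty so work is handed out
# sooner, grow them when the queue is backing up to cut enqueue overhead. The
# real diskover implementation, its step size and its min/max bounds may differ.
def adaptive_batch_sketch(q, cliargs, batchsize, min_size=50, max_size=500):
    """Return a new batch size based on the current rq queue depth."""
    q_len = len(q)
    if q_len == 0 and batchsize > min_size:
        batchsize -= 10
    elif q_len > 0 and batchsize < max_size:
        batchsize += 10
    return batchsize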
def scrape_tree_meta(paths, cliargs, reindex_dict):
    global worker
    tree_dirs = []
    tree_files = []
    totalcrawltime = 0
    num_workers = len(SimpleWorker.all(connection=redis_conn))
    for path in paths:
        starttime = time.time()
        root, dirs, files = path
        # check if dirchunk or stats embedded in data from
        # diskover tree walk client or crawlapi
        if type(root) is tuple:
            if root[1] == 'dchunk':
                dirchunk = True
                statsembeded = False
            else:
                statsembeded = True
                dirchunk = False
        else:
            statsembeded = False
            dirchunk = False
        if statsembeded:
            root_path = root[0]
            dmeta = get_dir_meta(worker, root, cliargs, reindex_dict, statsembeded=True)
        else:
            if dirchunk:
                root_path = root[0]
                dmeta = {'chunkpath': root_path}
            else:
                root_path = root
                dmeta = get_dir_meta(worker, root_path, cliargs, reindex_dict, statsembeded=False)
        if dmeta:
            filecount = 0
            # check if the directory has a ton of files in it and
            # farm out meta collection to other worker bots
            files_count = len(files)
            if cliargs['splitfiles'] and files_count >= cliargs['splitfilesnum']:
                fmetas = []
                for filelist in split_list(files, int(files_count / num_workers)):
                    fmetas.append(
                        q_crawl.enqueue(file_meta_collector,
                                        args=(filelist, root_path, statsembeded,
                                              cliargs, reindex_dict,),
                                        result_ttl=config['redis_ttl']))
                n = 0
                while n < len(fmetas):
                    if fmetas[n].result:
                        for fmeta in fmetas[n].result:
                            if fmeta:
                                tree_files.append(fmeta)
                                filecount += 1
                        n += 1
                    else:
                        # result not ready yet, wait briefly before polling again
                        time.sleep(.05)
                del fmetas[:]
            else:
                for file in files:
                    if statsembeded:
                        fmeta = get_file_meta(worker, file, cliargs, reindex_dict,
                                              statsembeded=True)
                    else:
                        fmeta = get_file_meta(worker, os.path.join(root_path, file),
                                              cliargs, reindex_dict, statsembeded=False)
                    if fmeta:
                        tree_files.append(fmeta)
                        filecount += 1
            # update crawl time
            elapsed = time.time() - starttime
            dmeta['crawl_time'] = round(elapsed, 6)
            # check for empty dirs
            if cliargs['indexemptydirs']:
                tree_dirs.append(dmeta)
            elif not cliargs['indexemptydirs'] and (len(dirs) > 0 or filecount > 0):
                tree_dirs.append(dmeta)
            totalcrawltime += elapsed
            # check if doc count is more than es chunksize and bulk add to es
            if len(tree_dirs) + len(tree_files) >= config['es_chunksize']:
                es_bulk_add(worker, tree_dirs, tree_files, cliargs, totalcrawltime)
                del tree_dirs[:]
                del tree_files[:]
                totalcrawltime = 0
    # bulk add to es
    if len(tree_dirs) > 0 or len(tree_files) > 0:
        es_bulk_add(worker, tree_dirs, tree_files, cliargs, totalcrawltime)
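# The splitfiles branch above enqueues file_meta_collector() jobs and then polls
# their .result. A minimal sketch of what such a job could look like is shown
# below, assuming it simply maps get_file_meta() over its chunk of filenames and
# returns the collected metadata for the parent job to merge; the real diskover
# function may do more than this.
def file_meta_collector_sketch(filelist, root_path, statsembeded, cliargs, reindex_dict):
    """Collect file metadata for one chunk of a split file list."""
    global worker
    fmetas = []
    for file in filelist:
        if statsembeded:
            fmeta = get_file_meta(worker, file, cliargs, reindex_dict, statsembeded=True)
        else:
            fmeta = get_file_meta(worker, os.path.join(root_path, file),
                                  cliargs, reindex_dict, statsembeded=False)
        fmetas.append(fmeta)
    return fmetas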
def scrape_tree_meta(paths, cliargs, reindex_dict):
    worker = get_worker_name()
    tree_dirs = []
    tree_files = []
    if cliargs['qumulo']:
        qumulo = True
        from diskover_qumulo import qumulo_get_dir_meta, qumulo_get_file_meta
    else:
        qumulo = False
    totalcrawltime = 0
    # check if other bots are idle and throw them some jobs (dir paths)
    if len(paths) >= cliargs['batchsize']:
        workers_idle = 0
        workers = Worker.all(connection=redis_conn)
        num_workers = len(workers)
        for w in workers:
            if w._state == "idle":
                workers_idle += 1
                if workers_idle > num_workers // 2:
                    workers_idle = True
                    break
        q_len = len(q_crawl)
        if q_len == 0 and workers_idle == True:
            # take half the paths randomly
            shuffle(paths)
            n = len(paths) // 2
            tosspaths = paths[:n]
            paths = paths[n:]
            q_crawl.enqueue(scrape_tree_meta, args=(tosspaths, cliargs, reindex_dict,))
    for path in paths:
        starttime = time.time()
        root, dirs, files = path
        totaldirsize = 0
        totaldiritems_subdirs = len(dirs)
        totaldiritems_files = 0
        # check if stats embedded in data from diskover tree walk client
        if type(root) is tuple:
            statsembeded = True
        else:
            statsembeded = False
        if qumulo:
            if root['path'] != '/':
                root_path = root['path'].rstrip(os.path.sep)
            else:
                root_path = root['path']
            dmeta = qumulo_get_dir_meta(worker, root, cliargs, reindex_dict, redis_conn)
        else:
            if statsembeded:
                root_path = root[0]
                dmeta = get_dir_meta(worker, root, cliargs, reindex_dict, statsembeded=True)
            else:
                root_path = root
                dmeta = get_dir_meta(worker, root_path, cliargs, reindex_dict, statsembeded=False)
        if dmeta == "sametimes":
            # fetch meta data for directory and all its files (doc sources) from index2
            # since directory times haven't changed
            dir_source, files_source = get_metadata(root_path, cliargs)
            datenow = datetime.utcnow().isoformat()
            for file_source in files_source:
                # update indexed at time
                file_source['indexing_date'] = datenow
                # update worker name
                file_source['worker_name'] = worker
                tree_files.append(('file', file_source))
            if dir_source:
                # update indexed at time
                dir_source['indexing_date'] = datenow
                # update worker name
                dir_source['worker_name'] = worker
                # update crawl time
                elapsed = time.time() - starttime
                dir_source['crawl_time'] = round(elapsed, 6)
                tree_dirs.append(dir_source)
                totalcrawltime += elapsed
        # get meta off disk since times different in Redis than on disk
        elif dmeta:
            # check if meta for files embedded
            if statsembeded:
                for file in files:
                    fmeta = get_file_meta(worker, file, cliargs, reindex_dict,
                                          statsembeded=True)
                    if fmeta:
                        tree_files.append(fmeta)
                        # add file size to totaldirsize
                        totaldirsize += fmeta['filesize']
                        totaldiritems_files += 1
            else:
                for file in files:
                    if qumulo:
                        fmeta = qumulo_get_file_meta(worker, file, cliargs, reindex_dict)
                    else:
                        fmeta = get_file_meta(worker, os.path.join(root_path, file),
                                              cliargs, reindex_dict, statsembeded=False)
                    if fmeta:
                        tree_files.append(fmeta)
                        # add file size to totaldirsize
                        totaldirsize += fmeta['filesize']
                        totaldiritems_files += 1
            # update crawl time
            elapsed = time.time() - starttime
            dmeta['crawl_time'] = round(elapsed, 6)
            # update directory meta filesize, items
            dmeta['filesize'] = totaldirsize
            dmeta['items_files'] = totaldiritems_files
            dmeta['items_subdirs'] = totaldiritems_subdirs
            totaldiritems = totaldiritems_files + totaldiritems_subdirs
            dmeta['items'] += totaldiritems
            tree_dirs.append(dmeta)
            totalcrawltime += elapsed
        # check if doc count is more than es chunksize and bulk add to es
        if len(tree_dirs) + len(tree_files) >= config['es_chunksize']:
            td = tree_dirs[:]
            tf = tree_files[:]
            es_bulk_add(worker, td, tf, cliargs, totalcrawltime)
            del tree_dirs[:]
            del tree_files[:]
            totalcrawltime = 0
    # bulk add to es
    if len(tree_dirs) > 0 or len(tree_files) > 0:
        es_bulk_add(worker, tree_dirs, tree_files, cliargs, totalcrawltime)
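# Usage sketch for the worker above: the tree walk side enqueues batches of
# (root, dirs, files) tuples onto the rq crawl queue, and each worker bot runs
# scrape_tree_meta() over its batch. The batch contents below are placeholder
# paths for illustration only, not values from the source.
def enqueue_example_batch_sketch(cliargs, reindex_dict):
    example_batch = [
        ('/srv/data/projects', ['alpha', 'beta'], ['readme.txt', 'notes.md']),
    ]
    return q_crawl.enqueue(scrape_tree_meta,
                           args=(example_batch, cliargs, reindex_dict,),
                           result_ttl=config['redis_ttl'])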
def calc_dir_size(dirlist, cliargs):
    """This is the calculate directory size worker function.
    It gets a directory list from the Queue and searches ES for all
    subdirs in each directory (recursive) and sums their filesize and
    items fields to create a total filesize and item count for each
    directory doc. Updates directory doc's filesize and items fields.
    """
    # check if other bots are idle and throw them some jobs (dir paths)
    if len(dirlist) >= cliargs['batchsize']:
        workers_idle = 0
        workers = Worker.all(connection=redis_conn)
        num_workers = len(workers)
        for w in workers:
            if w._state == "idle":
                workers_idle += 1
                if workers_idle > num_workers // 2:
                    workers_idle = True
                    break
        q_len = len(q_calc)
        if q_len == 0 and workers_idle == True:
            # take half the paths randomly
            shuffle(dirlist)
            n = len(dirlist) // 2
            tossdirs = dirlist[:n]
            dirlist = dirlist[n:]
            q_crawl.enqueue(calc_dir_size, args=(tossdirs, cliargs,))

    doclist = []
    for path in dirlist:
        totalitems = 1  # 1 for itself
        # file doc search with aggregate for sum filesizes
        # escape special characters
        newpath = escape_chars(path[1])
        parentpath = escape_chars(os.path.abspath(os.path.join(path[1], os.pardir)))
        pathbasename = escape_chars(os.path.basename(path[1]))
        # create wildcard string and check for / (root) path
        if newpath == '\/':
            newpathwildcard = '\/*'
        else:
            newpathwildcard = newpath + '\/*'
        # check if / (root) path
        if newpath == '\/':
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        "query": "path_parent: " + newpath + "*",
                        "analyze_wildcard": "true"
                    }
                },
                "aggs": {
                    "total_size": {
                        "sum": {
                            "field": "filesize"
                        }
                    }
                }
            }
        else:
            data = {
                "size": 0,
                "query": {
                    "query_string": {
                        'query': '(path_parent: ' + parentpath + ' AND filename: '
                                 + pathbasename + ') OR path_parent: ' + newpath
                                 + ' OR path_parent: ' + newpathwildcard,
                        'analyze_wildcard': 'true'
                    }
                },
                "aggs": {
                    "total_size": {
                        "sum": {
                            "field": "filesize"
                        }
                    },
                    "total_files": {
                        "sum": {
                            "field": "items_files"
                        }
                    },
                    "total_subdirs": {
                        "sum": {
                            "field": "items_subdirs"
                        }
                    }
                }
            }
        # search ES and start scroll for all directory doc search (subdirs)
        res = es.search(index=cliargs['index'], doc_type='directory', body=data,
                        request_timeout=config['es_timeout'])
        # total file size sum
        totalsize = res['aggregations']['total_size']['value']
        # total items sum for all subdirs count
        totalitems_subdirs = res['aggregations']['total_subdirs']['value']
        # total items sum for all files count
        totalitems_files = res['aggregations']['total_files']['value']
        totalitems += totalitems_subdirs + totalitems_files
        # update filesize and items fields for directory (path) doc
        d = {
            '_op_type': 'update',
            '_index': cliargs['index'],
            '_type': 'directory',
            '_id': path[0],
            'doc': {
                'filesize': totalsize,
                'items': totalitems,
                'items_files': totalitems_files,
                'items_subdirs': totalitems_subdirs
            }
        }
        doclist.append(d)
    index_bulk_add(es, doclist, config, cliargs)
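# The non-root branch above builds a query_string that matches the directory's
# own doc plus every doc beneath it, whose filesize/items_files/items_subdirs
# sums become the directory's totals. A hedged sketch of escape_chars() is shown
# below, assuming it backslash-escapes Lucene query_string special characters
# (including '/', which is why the root path compares equal to '\/' above); the
# real helper's character set may differ.
import re


def escape_chars_sketch(text):
    """Backslash-escape characters that are special in an ES query_string."""
    return re.sub(r'([+\-&|!(){}\[\]^"~*?:\\/ ])', r'\\\1', text)


# For a hypothetical directory '/srv/data/projects':
#   escape_chars_sketch('/srv/data/projects') -> \/srv\/data\/projects
# so the non-root query string becomes roughly:
#   (path_parent: \/srv\/data AND filename: projects)
#   OR path_parent: \/srv\/data\/projects OR path_parent: \/srv\/data\/projects\/*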