def es_bulk_add(worker_name, dirlist, filelist, cliargs, totalcrawltime=None):
    """Bulk add directory and file docs to the ES index, then record a
    per-worker stats doc.

    Args:
        worker_name: name of the crawl worker (stored in the stats doc)
        dirlist: list of directory docs to index
        filelist: list of file docs to index
        cliargs: dict of CLI args; 'index' selects the target ES index
        totalcrawltime: optional total crawl time in seconds; recorded
            as 0 when not supplied

    Uses the module-level ``es`` client and ``config`` dict.
    """
    starttime = time.time()
    docs = dirlist + filelist
    index_bulk_add(es, docs, config, cliargs)
    data = {
        "worker_name": worker_name,
        "dir_count": len(dirlist),
        "file_count": len(filelist),
        "bulk_time": round(time.time() - starttime, 6),
        # guard: round(None, 6) raises TypeError when totalcrawltime is
        # left at its default
        "crawl_time": round(totalcrawltime, 6) if totalcrawltime is not None else 0,
        "indexing_date": datetime.utcnow().isoformat()
    }
    es.index(index=cliargs['index'], doc_type='worker', body=data)
def es_bulk_add(worker_name, dirlist, filelist, cliargs, totalcrawltime=None):
    """Bulk add directory and file docs to the ES index, then record a
    per-worker stats doc.

    When cliargs['chunkfiles'] is set, directory docs carrying a
    'chunkpath' key belong to a chunked directory crawl: a doc for that
    directory may already exist in the index, so instead of indexing a
    duplicate we emit an ES update op that accumulates the new crawl
    time onto the existing doc's 'crawl_time'.

    Args:
        worker_name: name of the crawl worker (stored in the stats doc)
        dirlist: list of directory docs to index
        filelist: list of file docs to index
        cliargs: dict of CLI args; uses 'index', 'chunkfiles',
            'noworkerdocs'
        totalcrawltime: optional total crawl time in seconds; recorded
            as 0 when not supplied

    Uses the module-level ``es`` client and ``config`` dict.
    """
    if cliargs['chunkfiles']:
        # refresh once (hoisted out of the loop) so the searches below
        # can see recently indexed directory docs
        es.indices.refresh(index=cliargs['index'])
        updated_dirlist = []
        for d in dirlist:
            try:
                # these keys exist only on docs from a chunked dir crawl;
                # the try covers ONLY this probe so unrelated KeyErrors
                # (e.g. from the search result) are no longer swallowed
                path = d['chunkpath']
                crawltime = d['crawl_time']
            except KeyError:
                # not part of a chunked dir — index the doc as-is
                updated_dirlist.append(d)
                continue
            filename = os.path.basename(path)
            # parent path of the chunked directory
            parentpath = os.path.abspath(os.path.join(path, os.pardir))
            data = {
                "size": 1,
                "_source": ['crawl_time'],
                "query": {
                    "query_string": {
                        "query": "filename: \"" + filename + "\" AND path_parent: \"" + parentpath + "\""
                    }
                }
            }
            res = es.search(index=cliargs['index'], doc_type='directory',
                            body=data,
                            request_timeout=config['es_timeout'])
            hits = res['hits']['hits']
            if not hits:
                # NOTE(review): no existing doc found — the chunked dir
                # doc is dropped entirely (original behavior preserved);
                # confirm this is intended rather than indexing d as-is
                continue
            docid = hits[0]['_id']
            updated_crawltime = hits[0]['_source']['crawl_time'] + crawltime
            # replace the doc with an update op that accumulates crawl_time
            updated_dirlist.append({
                '_op_type': 'update',
                '_index': cliargs['index'],
                '_type': 'directory',
                '_id': docid,
                'doc': {
                    'crawl_time': updated_crawltime
                }
            })
        dirlist = updated_dirlist

    starttime = time.time()
    docs = dirlist + filelist
    index_bulk_add(es, docs, config, cliargs)

    if not cliargs['noworkerdocs']:
        data = {
            "worker_name": worker_name,
            "dir_count": len(dirlist),
            "file_count": len(filelist),
            "bulk_time": round(time.time() - starttime, 6),
            # guard: round(None, 6) raises TypeError when totalcrawltime
            # is left at its default
            "crawl_time": round(totalcrawltime, 6) if totalcrawltime is not None else 0,
            "indexing_date": datetime.utcnow().isoformat()
        }
        es.index(index=cliargs['index'], doc_type='worker', body=data)