import logging
import os
from datetime import datetime
from pathlib import Path

from elasticsearch import Elasticsearch

# Assumed module-level setup: point the client and the index names at your own
# deployment. SHA256, get_file_name_and_extension, get_tika_meta and
# get_tika_content are project helpers defined elsewhere.
ES = Elasticsearch()
DOCUMENTS_INDEX = "documents"
DUPLICATES_INDEX = "duplicates"


def index_document_by_hash(file_hash, file_path):
    """Create a minimal entry keyed by the file's content hash, skipping
    hashes that are already indexed. (Renamed here from index_document to
    avoid clashing with the fuller variant below.)"""
    if not ES.exists(index=DOCUMENTS_INDEX, id=file_hash):
        file_name, file_extension = get_file_name_and_extension(file_path)
        doc = {
            'name': file_name,
            'extension': file_extension,
        }
        ES.index(index=DOCUMENTS_INDEX, body=doc, id=file_hash)

def document_exist(path):
    """Return True if a document with this path is already indexed.

    Side effect: every matching document is re-marked exists=True, so a
    filesystem scan can use this check to confirm files are still present.
    """
    body = {"query": {"match": {"path": path}}}
    r = ES.search(body=body, index=DOCUMENTS_INDEX)
    exists = r['hits']['total']['value'] > 0
    for doc in r['hits']['hits']:
        ES.update(index=DOCUMENTS_INDEX, id=doc['_id'],
                  body={"doc": {"exists": True}})
    return exists

def index_document(path):
    """Index a file with its name, hash, size and timestamp, unless its
    path is already known."""
    if not document_exist(path):
        file_hash = SHA256.hash_file(path)
        file_name, extension = get_file_name_and_extension(path)
        doc = {
            'path': path,
            'name': file_name,
            'extension': extension,
            'hash': file_hash,
            'size': os.stat(path).st_size,
            'timestamp': datetime.now(),
            'exists': True
        }
        ES.index(index=DOCUMENTS_INDEX, body=doc)

def remove_document_from_index(path):
    body = {
        "query": {
            "term": {
                "path": {
                    "value": path,
                    "boost": 1.0
                }
            }
        }
    }
    try:
        doc = ES.search(body=body, index=DOCUMENTS_INDEX)['hits']['hits'][0]
        ES.delete(id=doc['_id'], index=DOCUMENTS_INDEX)
    except Exception:  # no hit for this path, or the delete itself failed
        logging.error("Can't remove {}".format(path))

def delete_document_copies(file_hash):
    """Delete all but the oldest copy of a file, both on disk and in the
    index. Returns the list of paths that were removed."""
    body = {
        "query": {
            "term": {
                "hash": {
                    "value": file_hash,
                    "boost": 1.0
                }
            }
        },
        "sort": [
            {"timestamp": {"order": "asc"}}
        ]
    }
    all_copies = ES.search(body=body, index=DOCUMENTS_INDEX)['hits']['hits']
    # Refuse to delete anything if the index is out of sync with the disk.
    for copy in all_copies:
        assert Path(copy['_source']['path']).exists()
    result = []
    if len(all_copies) > 1:
        # The oldest copy (first after the ascending sort) is kept.
        for copy in all_copies[1:]:
            path = copy['_source']['path']
            try:
                os.remove(path)
                remove_document_from_index(path)
                result.append(path)
            except OSError:
                logging.error("Can't delete file {}".format(path))
    return result

def search_documents(query):
    body = {
        "query": {
            "multi_match": {
                "query": query,
                "fields": ["name^5", "content", "path"]
            }
        },
        "size": 1000
    }
    return ES.search(body=body, index=DOCUMENTS_INDEX)

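# Usage sketch (hypothetical, not part of the library): run a query and print
# the best matches. Assumes the documents were indexed by index_document above
# and therefore carry a 'path' field.
def _demo_search(query):
    for hit in search_documents(query)['hits']['hits']:
        print("{:.2f}  {}".format(hit['_score'],
                                  hit['_source'].get('path', hit['_source']['name'])))
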
def get_all_missing_documents(index=DOCUMENTS_INDEX, scroll_timeout="2m", **kwargs):
    """Yield every document whose file was not seen during the last scan
    (exists == False), paging through the index with the scroll API."""
    is_first = True
    body = {"query": {"term": {"exists": {"value": False, "boost": 1.0}}}}
    while True:
        if is_first:  # Initialize scroll
            result = ES.search(index=index, scroll=scroll_timeout, **kwargs,
                               body=body)
            is_first = False
        else:
            result = ES.scroll(body={
                "scroll_id": scroll_id,
                "scroll": scroll_timeout
            })
        scroll_id = result["_scroll_id"]
        hits = result["hits"]["hits"]
        if not hits:
            break
        yield from hits
    # Free the scroll context on the server once iteration is done.
    ES.clear_scroll(scroll_id=scroll_id)

def get_all_documents(pagesize=250, index=DOCUMENTS_INDEX, scroll_timeout="2m", **kwargs):
    """Yield every document in the index, one page at a time."""
    is_first = True
    while True:
        if is_first:  # Initialize scroll
            result = ES.search(index=index, scroll=scroll_timeout, **kwargs,
                               body={"size": pagesize})
            is_first = False
        else:
            result = ES.scroll(body={
                "scroll_id": scroll_id,
                "scroll": scroll_timeout
            })
        scroll_id = result["_scroll_id"]
        hits = result["hits"]["hits"]
        if not hits:
            break
        yield from hits
    # Free the scroll context on the server once iteration is done.
    ES.clear_scroll(scroll_id=scroll_id)

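# Usage sketch (hypothetical): stream the whole index without loading it into
# memory at once, e.g. to dump every known path.
def _demo_dump_paths():
    for doc in get_all_documents(pagesize=500):
        print(doc['_source'].get('path', '<no path>'))
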
def get_stats_extensions():
    body = {
        "aggs": {
            "extensions": {
                "terms": {
                    "field": "extension",
                    "size": 1000
                }
            }
        },
        "size": 0
    }
    return ES.search(body=body, index=DOCUMENTS_INDEX)

def get_similar_documents(file_hash):
    """Find documents whose content closely matches the given document's,
    and record any group found in the duplicates index."""
    body = {
        "query": {
            "term": {
                "hash": {
                    "value": file_hash,
                    "boost": 1.0
                }
            }
        }
    }
    doc = ES.search(body=body, index=DOCUMENTS_INDEX)['hits']['hits'][0]
    original_content = doc['_source']['content']
    original_name = doc['_source']['name']
    original_hash = doc['_source']['hash']
    results = {original_hash: original_name}
    if len(original_content) > 0:
        # Compare on a stable fingerprint: the first 500 words, sorted, so
        # word order does not affect the match score.
        body = {
            "query": {
                "match": {
                    "content": {
                        "query": " ".join(sorted(original_content.split()[:500]))
                    }
                }
            },
            "sort": ["_score"]
        }
        match = ES.search(body=body, index=DOCUMENTS_INDEX)
        score_max = match['hits']['max_score']
        # Keep everything within 5% of the best score.
        score_threshold = score_max - (score_max / 100 * 5)
        for r in match['hits']['hits']:
            if r['_score'] > score_threshold:
                results[r['_source']['hash']] = r['_source']['name']
        if len(results) > 1:
            ES.index(index=DUPLICATES_INDEX, body=results)
    return results

def library_size():
    body = {
        "query": {
            "match_all": {}
        },
        "size": 0,
        "aggs": {
            "library_size": {
                "sum": {
                    "field": "size"
                }
            }
        }
    }
    return ES.search(body=body, index=DOCUMENTS_INDEX)

def get_duplicate_files():
    body = {
        "size": 0,
        "aggs": {
            "duplicate_hashes": {
                "terms": {
                    "field": "hash",
                    "size": 5000,
                    "min_doc_count": 2
                }
            }
        }
    }
    hashes = ES.search(body=body, index=DOCUMENTS_INDEX)['aggregations']['duplicate_hashes']['buckets']
    return [h['key'] for h in hashes]

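# Usage sketch (hypothetical): a full deduplication sweep. Every hash that
# appears more than once keeps its oldest copy; the rest are deleted from disk
# and from the index by delete_document_copies above.
def _demo_dedup_sweep():
    removed = []
    for file_hash in get_duplicate_files():
        removed.extend(delete_document_copies(file_hash))
    logging.info("Removed %d duplicate files", len(removed))
    return removed
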
def index_document_metadata(doc_id, path):
    try:
        file_meta = get_tika_meta(path)
    except Exception:  # Tika extraction failed; store empty metadata instead
        file_meta = {}
    ES.update(index=DOCUMENTS_INDEX, id=doc_id, body={"doc": {"meta": file_meta}})

def reset_exists():
    """Mark every document as missing; a subsequent filesystem scan flips
    the flag back for files that are still on disk."""
    for doc in get_all_documents():
        ES.update(index=DOCUMENTS_INDEX, id=doc['_id'],
                  body={"doc": {"exists": False}})

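# Usage sketch (hypothetical): one synchronisation cycle over a library
# directory (library_root is a placeholder). reset_exists() marks everything
# missing, indexing the files on disk re-marks whatever is still present via
# document_exist's side effect, and anything left at exists=False is pruned
# from the index.
def _demo_rescan(library_root):
    reset_exists()
    for path in Path(library_root).rglob('*'):
        if path.is_file():
            index_document(str(path))
    for doc in get_all_missing_documents():
        remove_document_from_index(doc['_source']['path'])
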
def index_document_content(doc_id, path):
    try:
        file_content = get_tika_content(path)
    except Exception:  # Tika extraction failed; store empty content instead
        file_content = ""
    ES.update(index=DOCUMENTS_INDEX, id=doc_id, body={"doc": {"content": file_content}})

def get_document(doc_id):
    return ES.get(index=DOCUMENTS_INDEX, id=doc_id)

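# Usage sketch (hypothetical): full ingestion of a single file. index_document
# stores the basic entry; the Elasticsearch _id is then looked up via the
# file's hash so content and metadata can be attached with the update helpers
# above.
def _demo_ingest(path):
    index_document(path)
    file_hash = SHA256.hash_file(path)
    body = {"query": {"term": {"hash": {"value": file_hash}}}}
    hit = ES.search(body=body, index=DOCUMENTS_INDEX)['hits']['hits'][0]
    index_document_content(hit['_id'], path)
    index_document_metadata(hit['_id'], path)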