def consumer(file):
    # resume from the last line recorded in the state index
    num_to_skip = es_client.get_source(
        index=STATE_INDEX_NAME, id=file, doc_type=STATE_TYPE
    )["line"]
    process_name = multiprocessing.current_process().name
    print("%s is skipping %d lines in file: %s" % (process_name, num_to_skip, file))
    results_g = helpers.streaming_bulk(
        es_client,
        actions=(
            build_es_action(try_to_process(d), es_index_name, es_type, op_type="index")
            for d in data_io.read_jsonl(file, limit=limit, num_to_skip=num_to_skip)
        ),
        chunk_size=chunk_size,
        yield_ok=True,
        raise_on_error=False,
        raise_on_exception=False,
    )
    counter = num_to_skip
    for k, (ok, d) in enumerate(results_g):
        counter += 1
        if not ok and "index" in d:
            print("failed to index: %s" % str(d))
        if k % 1000 == 0:
            update_state(file, {"line": counter})
    update_state(file, {"line": counter})
    if limit is None or counter < limit:
        # fewer docs than the limit were read, so the file is exhausted
        update_state(file, {"done": True})
    print("%s is done; inserted %d new docs!" % (process_name, counter - num_to_skip))
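# NOTE: `update_state` is not defined in this snippet. A minimal sketch of what it
# is assumed to do, namely a partial update of the per-file progress document in
# the state index (index and type names as created by setup_index below):
def update_state(file, body):
    es_client.update(
        index=STATE_INDEX_NAME,
        doc_type=STATE_TYPE,
        id=file,
        body={"doc": body},
    )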
def populate_es_parallel_bulk(
    es, files, es_index_name, es_type, limit=None, num_processes=4, chunk_size=500
):
    dicts_g = (d for file in files for d in read_jsonl(file, limit=limit))
    actions_g = (build_es_action(d, es_index_name, es_type) for d in dicts_g)
    results_g = helpers.parallel_bulk(
        es,
        actions_g,
        thread_count=num_processes,
        queue_size=num_processes,
        chunk_size=chunk_size,
        raise_on_exception=False,
        raise_on_error=False,
    )
    failed_g = (
        pop_exception(d)
        for ok, d in tqdm(results_g)
        if not ok and d.get("create", {}).get("status", 200) != 409
    )
    data_io.write_jsonl("failed.jsonl", failed_g)
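# Example call (illustrative only; host, file paths and index names are placeholders):
if __name__ == "__main__":
    es = Elasticsearch(hosts=["localhost:9200"])
    files = ["data/part_0.jsonl.gz", "data/part_1.jsonl.gz"]
    populate_es_parallel_bulk(es, files, es_index_name="docs", es_type="doc")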
def populate_es_streaming_bulk(
    es_client: Elasticsearch,
    dicts: Iterable[Dict],
    es_index_name: str,
    es_type: str,
    chunk_size: int = 500,
):
    def pop_exception(d):
        d["index"].pop("exception")
        return d

    es_actions_g = (
        build_es_action(d, index_name=es_index_name, es_type=es_type) for d in dicts
    )
    results_g = helpers.streaming_bulk(
        es_client,
        es_actions_g,
        chunk_size=chunk_size,
        yield_ok=True,
        raise_on_error=False,
    )
    failed_g = (pop_exception(d) for ok, d in tqdm(results_g) if not ok)
    data_io.write_jsonl("failed.jsonl", failed_g)
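# Example usage (illustrative; the file path and index name are placeholders):
es_client = Elasticsearch(hosts=["localhost:9200"])
dicts = data_io.read_jsonl("data/part_0.jsonl.gz")
populate_es_streaming_bulk(es_client, dicts, es_index_name="docs", es_type="doc")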
def consumer(file):
    process_name = multiprocessing.current_process().name
    print("%s is doing %s; limit: %s" % (process_name, file, limit))
    dicts_g = (d for d in data_io.read_jsonl(file, limit=limit))
    actions_g = (
        build_es_action(d, es_index_name, es_type, op_type="index") for d in dicts_g
    )
    results_g = helpers.streaming_bulk(
        es_client,
        actions_g,
        chunk_size=chunk_size,
        yield_ok=True,
        raise_on_error=False,
        raise_on_exception=False,
    )
    failed_g = (pop_exception(d) for ok, d in results_g if not ok)
    data_io.write_jsonl("%s_failed.jsonl" % process_name, failed_g)
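# A hypothetical driver for the consumer above (not part of the original code):
# one process per file, relying on a fork-based start method so the closure and
# its enclosing variables are inherited by the child processes.
processes = [multiprocessing.Process(target=consumer, args=(f,)) for f in files]
for p in processes:
    p.start()
for p in processes:
    p.join()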
def setup_index(
    es_client, files: List[str], INDEX_NAME, TYPE, from_scratch=False, mapping=None
):
    STATE_INDEX_NAME = INDEX_NAME + "_state"
    STATE_TYPE = "file_state"

    if from_scratch:
        es_client.indices.delete(index=INDEX_NAME, ignore=[400, 404])
        es_client.indices.delete(index=STATE_INDEX_NAME, ignore=[400, 404])
        sleep(3)

    es_client.indices.create(index=INDEX_NAME, ignore=400, body=mapping)
    es_client.indices.create(index=STATE_INDEX_NAME, ignore=400)
    sleep(3)

    def build_es_action(datum, index_name, es_type, op_type="index"):
        _source = {
            k: None if isinstance(v, str) and len(v) == 0 else v
            for k, v in datum.items()
        }
        doc = {
            "_id": datum["file"],
            "_op_type": op_type,
            "_index": index_name,
            "_type": es_type,
            "_source": _source,
        }
        return doc

    helpers.bulk(
        es_client,
        (
            build_es_action(
                {"file": file, "line": 0, "done": False},
                STATE_INDEX_NAME,
                STATE_TYPE,
                op_type="create",
            )
            for file in files
        ),
        raise_on_error=False,
    )

    sum_in_state = sum(
        es_client.get_source(index=STATE_INDEX_NAME, id=file, doc_type=STATE_TYPE)["line"]
        for file in files
    )
    if sum_in_state > 0:
        count = es_client.count(index=INDEX_NAME, doc_type=TYPE)["count"]
        assert sum_in_state == count, (sum_in_state, count)

    body = """
    {
        "query": {
            "bool": {
                "must": [
                    {"term": {"done": {"value": "true"}}}
                ]
            }
        }
    }
    """
    r = es_client.search(index=STATE_INDEX_NAME, body=body, size=10_000)
    files_in_es = {os.path.split(s["_source"]["file"])[1] for s in r["hits"]["hits"]}
    not_yet_in_index_files = [
        f for f in files if os.path.split(f)[1] not in files_in_es
    ]
    print("got %d files which are not yet in ES-index" % len(not_yet_in_index_files))
    return not_yet_in_index_files
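# Example of a mapping body that could be passed to setup_index; the field names
# are illustrative, not taken from the original project (ES 6.x-style mapping with
# an explicit type, matching the doc_type usage above):
example_mapping = {
    "mappings": {
        "doc": {
            "properties": {
                "file": {"type": "keyword"},
                "text": {"type": "text"},
            }
        }
    }
}
files_todo = setup_index(
    es_client, files, INDEX_NAME="docs", TYPE="doc",
    from_scratch=False, mapping=example_mapping,
)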