def import_json(self, in_lines, website_id: int):
    import_every = 10000
    cooldown_time = 0

    docs = []
    for line in in_lines:
        try:
            doc = ujson.loads(line)
            # Split the extension out of the file name (drop the leading dot)
            name, ext = os.path.splitext(doc["name"])
            doc["ext"] = ext[1:].lower() if ext and len(ext) > 1 else ""
            doc["name"] = name
            doc["website_id"] = website_id
            docs.append(doc)
        except Exception as e:
            logger.error("Error in import_json: " + str(e) + " for line:\n" + line)

        # Flush to Elasticsearch in batches of import_every documents
        if len(docs) >= import_every:
            self._index(docs)
            docs.clear()
            time.sleep(cooldown_time)

    if docs:
        self._index(docs)
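# Illustrative usage (not part of the class): import_json expects an iterable of
# newline-delimited JSON strings, one document per line, each carrying at least a
# "name" field. Assuming `search` is an ElasticSearchEngine instance, a call with a
# hypothetical crawl result file could look like:
#
#     with open("crawl_result.ndjson") as f:
#         search.import_json(f, website_id=42)
#
# The file name and website_id above are made-up values for illustration only.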
def delete_docs(self, website_id):
    while True:
        try:
            logger.debug("Deleting docs of " + str(website_id))
            # Scroll through all documents belonging to this website
            to_delete = helpers.scan(query={
                "query": {
                    "term": {"website_id": website_id}
                }
            }, scroll="1m", client=self.es, index=self.index_name, request_timeout=120, routing=website_id)

            buf = []
            counter = 0
            for doc in to_delete:
                buf.append(doc)
                counter += 1

                # Delete in batches of 10000 documents
                if counter >= 10000:
                    self._delete(buf, website_id)
                    buf.clear()
                    counter = 0

            if counter > 0:
                self._delete(buf, website_id)
            break
        except Exception as e:
            logger.error("During delete: " + str(e))
            time.sleep(10)

    logger.debug("Done deleting for " + str(website_id))
def _delete(self, docs, website_id):
    bulk_string = self.create_bulk_delete_string(docs)
    result = self.es.bulk(body=bulk_string, index=self.index_name, doc_type="file",
                          request_timeout=30, routing=website_id)

    if result["errors"]:
        logger.error("Error in ES bulk delete: \n" + str(result["errors"]))
        raise IndexingError
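# The create_bulk_delete_string helper is referenced above but not shown in this
# section. Below is a minimal sketch of what it could look like, assuming `docs` are
# the raw hits yielded by helpers.scan (each carrying an "_id" field) and that the
# Elasticsearch bulk API's newline-delimited format is used: one delete action line
# per document, with a trailing newline. This is an assumption, not the actual helper.
@staticmethod
def create_bulk_delete_string(docs):
    # One '{"delete": {"_id": ...}}' action line per scanned hit
    return "".join(ujson.dumps({"delete": {"_id": doc["_id"]}}) + "\n" for doc in docs)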
def _index(self, docs):
    while True:
        try:
            logger.debug("Indexing " + str(len(docs)) + " docs")
            bulk_string = ElasticSearchEngine.create_bulk_index_string(docs)
            self.es.bulk(body=bulk_string, index=self.index_name, doc_type="file",
                         request_timeout=30, routing=docs[0]["website_id"])
            break
        except Exception as e:
            logger.error("Error in _index: " + str(e) + ", retrying")
            time.sleep(10)
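# The create_bulk_index_string helper used by _index is also not included in this
# section. A minimal sketch is given below, assuming the standard Elasticsearch bulk
# format: each document is preceded by an '{"index":{}}' action line and followed by
# a newline, letting Elasticsearch assign the document id. The real implementation
# may set explicit ids or routing in the action line.
@staticmethod
def create_bulk_index_string(docs):
    # Interleave an index action line with each document's JSON source
    return "".join('{"index":{}}\n' + ujson.dumps(doc) + "\n" for doc in docs)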