def scroll(cls, q=None, page_size=1000, limit=None, keepalive="1m", conn=None, raise_on_scroll_error=True, types=None, wrap=True):
    """Generator over every document matching *q* via an ES scroll search.

    :param q: query body; defaults to a match_all query when None
    :param page_size: number of documents fetched per scroll page
    :param limit: maximum number of documents to yield (None = no limit)
    :param keepalive: scroll context keepalive passed to the backend
    :param conn: ES connection; defaults to the class-level connection
    :param raise_on_scroll_error: if True, re-raise tasks.ScrollException;
        if False, terminate the generator silently on scroll failure
    :param types: index types to read; normalised via cls.get_read_types
    :param wrap: if True, yield each hit wrapped in cls(...); otherwise
        yield the raw source documents
    """
    if conn is None:
        conn = cls.__conn__
    types = cls.get_read_types(types)
    if q is None:
        q = {"query": {"match_all": {}}}
    gen = tasks.scroll(conn, types, q, page_size=page_size, limit=limit, keepalive=keepalive)
    try:
        for o in gen:
            yield cls(o) if wrap else o
    except tasks.ScrollException:
        # Bare raise preserves the original traceback ("raise e" truncates
        # it on Python 2); when suppressed, the generator simply ends.
        if raise_on_scroll_error:
            raise
def scroll_edit(connection, es_type, query):
    """ Use a scroll search to update a field based on a given query

    Scrolls every document matching *query*, rewrites Hindawi URLs in the
    serialised document text, and bulk-writes the edited documents back to
    the index in batches.

    NOTE(review): Python 2 syntax (print statements) — this chunk appears
    alongside an already-migrated Python 3 twin of the same function.
    """
    # Accumulate edited documents; flush to the index every batch_size docs
    write_batch = []
    batch_size = 200

    for a in tasks.scroll(connection, type=es_type, q=query):
        # Substitute the text and add to the write batch.
        # The document is round-tripped through JSON text so the regex can
        # operate on the whole serialised record at once.
        d = match_hindawi_urls.sub(replacement_text, json.dumps(a))
        write_batch.append(json.loads(d))

        # When we have enough, do some writing
        if len(write_batch) >= batch_size:
            print "writing ", len(write_batch)
            raw.bulk(connection, es_type, write_batch)
            write_batch = []

    # Write the last part-batch to index
    if len(write_batch) > 0:
        print "writing ", len(write_batch)
        raw.bulk(connection, es_type, write_batch)
def scroll_edit(connection, es_type, query):
    """ Use a scroll search to update a field based on a given query """
    BATCH_LIMIT = 200
    pending = []

    for record in tasks.scroll(connection, type=es_type, q=query):
        # Round-trip the document through JSON text so the URL-matching
        # regex can rewrite the whole serialised record in one pass
        rewritten = match_hindawi_urls.sub(replacement_text, json.dumps(record))
        pending.append(json.loads(rewritten))

        # Flush a full batch back to the index
        if len(pending) >= BATCH_LIMIT:
            print("writing ", len(pending))
            raw.bulk(connection, es_type, pending)
            pending = []

    # Flush whatever is left once the scroll is exhausted
    if pending:
        print("writing ", len(pending))
        raw.bulk(connection, es_type, pending)
        }
    }
}
# NOTE(review): the braces above close a query dict literal that starts
# outside this chunk — indentation reconstructed, confirm against original.

# Connection to the ES index
conn = raw.make_connection(None, 'localhost', 9200, 'doaj')

# Edit the Journals
write_batch = []
batch_size = 1000

for article_issn in missed_articles:
    # Re-use the same query body, swapping in each missed ISSN
    query['query']['query_string']['query'] = article_issn
    for a in tasks.scroll(conn, 'article', query):
        try:
            article_model = models.Article(_source=a)
            a_license = article_model.data.get('index')['license']
            # Change the license
            article_model.data.get('index')['license'] = [license_correct_dict[a_license[0]]]
            write_batch.append(article_model.data)
        except ValueError:
            print "Failed to create a model"
        except KeyError:
            print "No license to change"
    # When we have enough, do some writing
    # NOTE(review): loop nesting reconstructed from a collapsed line, and the
    # chunk is truncated here (no visible write_batch reset / final flush) —
    # verify against the original script.
    if len(write_batch) >= batch_size:
        print "writing ", len(write_batch)
        models.Article.bulk(write_batch)
        }
    }
}
# NOTE(review): the braces above close a query dict literal that starts
# outside this chunk — indentation reconstructed, confirm against original.

# Connection to the ES index
conn = raw.make_connection(None, 'localhost', 9200, 'doaj')

# Edit the Journals
write_batch = []
batch_size = 1000

for article_issn in missed_articles:
    # Re-use the same query body, swapping in each missed ISSN
    query['query']['query_string']['query'] = article_issn
    for a in tasks.scroll(conn, 'article', query):
        try:
            article_model = models.Article(_source=a)
            a_license = article_model.data.get('index')['license']
            # Change the license
            article_model.data.get('index')['license'] = [
                license_correct_dict[a_license[0]]
            ]
            write_batch.append(article_model.data)
        except ValueError:
            print "Failed to create a model"
        except KeyError:
            print "No license to change"
    # When we have enough, do some writing
    # NOTE(review): this chunk is truncated — the body of the following
    # conditional (the bulk write / batch reset) is missing from view.
    if len(write_batch) >= batch_size:
# Counters and id lists for the run summary: edited / failed / unchanged /
# no-licence journals
write_batch = []
batch_size = 1000
edited = 0
failed = 0
unchanged = 0
nolicence = 0
ed = []
fa = []
un = []
nl = []

# Process the previous set of journals
for j in tasks.scroll(conn, 'journal'):
    try:
        journal_model = models.Journal(_source=j)
        # Change the license
        j_license = journal_model.bibjson().get_license()
        if j_license:
            # Map both the licence type and title through the correction table
            j_license['type'] = license_correct_dict[j_license['type']]
            j_license['title'] = license_correct_dict[j_license['title']]
            print "edited\t{0}".format(journal_model.id)
            ed.append(journal_model.id)
            edited += 1
            # prep() presumably refreshes derived/index fields before writing —
            # TODO confirm against the models.Journal implementation
            journal_model.prep()
            write_batch.append(journal_model.data)
        else:
            nolicence += 1
            print "no licence\t{0}".format(journal_model.id)
# NOTE(review): chunk truncated here — the except clause(s) for the try
# above (and any batch flush) are outside this view.