def old_dates_update():
    """
    This function updates data by pulling 5 old dates from the past year.
    When run once each day, the entire past year will update every 73 days.
    """
    # pick 5 oldest dates from past year
    date_sql = """
        SELECT * FROM doi_queue_paperbuzz_dates
        WHERE id > now() - interval '1 year'
        ORDER BY finished ASC NULLS FIRST
        LIMIT 5;
    """
    dates = db.session.execute(date_sql)

    # set values to null so they are picked up by importer
    for date in dates:
        id_date = date[0]
        run_sql(
            db,
            """update doi_queue_paperbuzz_dates
               set enqueued=NULL, finished=NULL, started=NULL, dyno=NULL
               where id = '{id_date}'""".format(id_date=id_date),
        )

    # run update script
    os.system("python doi_queue.py --dates --run")
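# A minimal standalone sketch (not part of the app) double-checking the
# docstring's arithmetic: refreshing 5 dates per daily run means the 365
# dates in a trailing year are all revisited after ceil(365 / 5) = 73 runs,
# i.e. one full pass every 73 days.
import math

DATES_IN_YEAR = 365
DATES_PER_RUN = 5  # matches the LIMIT 5 in old_dates_update above

assert math.ceil(DATES_IN_YEAR / float(DATES_PER_RUN)) == 73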
def kick(job_type):
    q = """update {table_name} set started=null, finished=null
           where finished is null
           and id in (select id from {table_name} where started is not null)""".format(
        table_name=table_name(job_type))
    run_sql(db, q)
    print_status(job_type)
def worker_run(self, **kwargs): single_obj_id = kwargs.get("id", None) chunk = kwargs.get("chunk", 1) queue_table = "doi_queue_dates" run_class = DateRange run_method = "scroll_through_all_dois" if not single_obj_id: text_query_pattern = """WITH picked_from_queue AS ( SELECT * FROM {queue_table} WHERE started is null ORDER BY rand LIMIT {chunk} FOR UPDATE SKIP LOCKED ) UPDATE {queue_table} queue_rows_to_update SET started=now() FROM picked_from_queue WHERE picked_from_queue.id = queue_rows_to_update.id RETURNING picked_from_queue.*;""" text_query = text_query_pattern.format( chunk=chunk, queue_table=queue_table ) logger.info(u"the queue query is:\n{}".format(text_query)) index = 0 start_time = time() while True: new_loop_start_time = time() if single_obj_id: objects = [run_class.query.filter(run_class.id == single_obj_id).first()] else: # logger.info(u"looking for new jobs") objects = run_class.query.from_statement(text(text_query)).execution_options(autocommit=True).all() # logger.info(u"finished get-new-objects query in {} seconds".format(elapsed(new_loop_start_time))) if not objects: # logger.info(u"sleeping for 5 seconds, then going again") sleep(5) continue object_ids = [obj.id for obj in objects] self.update_fn(run_class, run_method, objects, index=index) if object_ids: object_ids_str = u",".join([u"'{}'".format(str(id).replace(u"'", u"''")) for id in object_ids]) object_ids_str = object_ids_str.replace(u"%", u"%%") #sql escaping run_sql(db, u"update {queue_table} set finished=now() where id in ({ids})".format( queue_table=queue_table, ids=object_ids_str)) # finished is set in update_fn index += 1 if single_obj_id: return else: self.print_update(new_loop_start_time, chunk, chunk, start_time, index)
def process_failed_import(self):
    run_sql(
        db,
        """update doi_queue_paperbuzz_dates
           set enqueued=NULL, finished=NULL, started=NULL, dyno=NULL
           where id = '{id_date}'""".format(id_date=self.id),
    )
    raise requests.exceptions.ReadTimeout
def worker_run(self, **kwargs): run_class = PdfUrl single_url = kwargs.get("id", None) chunk_size = kwargs.get("chunk", 100) limit = kwargs.get("limit", None) if limit is None: limit = float("inf") if single_url: objects = [run_class.query.filter(run_class.url == single_url).first()] check_pdf_urls(objects) else: index = 0 num_updated = 0 start_time = time() while num_updated < limit: new_loop_start_time = time() objects = self.fetch_queue_chunk(chunk_size) if not objects: sleep(5) continue check_pdf_urls(objects) object_ids = [obj.url for obj in objects] object_ids_str = u",".join([u"'{}'".format(oid.replace(u"'", u"''")) for oid in object_ids]) object_ids_str = object_ids_str.replace(u"%", u"%%") # sql escaping sql_command = u""" update {queue_table} q set finished = now(), started = null, retry_interval = least( case when is_pdf then '2 hours' else 4 * coalesce(retry_interval, '2 hours') end, '2 months' ), retry_at = now() + case when is_pdf then '2 months' else coalesce(retry_interval, '2 hours') end from pdf_url where pdf_url.url = q.url and q.url in ({ids}) """.format( queue_table=self.table_name(None), ids=object_ids_str ) run_sql(db, sql_command) index += 1 num_updated += len(objects) self.print_update(new_loop_start_time, chunk_size, limit, start_time, index)
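# An illustrative sketch (assumption: not part of the codebase) of the retry
# schedule the SQL above implements. For urls that are not pdfs, the interval
# starts from a 2-hour base, quadruples on every failed check, and is capped
# at 2 months; a confirmed pdf resets to the base interval and is rechecked
# in 2 months.
from datetime import timedelta

def next_retry_interval(current, is_pdf):
    two_hours = timedelta(hours=2)
    two_months = timedelta(days=60)  # rough stand-in for the '2 months' interval
    if is_pdf:
        return two_hours
    return min(4 * (current or two_hours), two_months)

# successive failures back off as 8h, 32h, 128h, 512h, then hit the 60-day cap
interval = None
for _ in range(5):
    interval = next_retry_interval(interval, is_pdf=False)
assert interval == timedelta(days=60)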
def add_dois_to_queue_from_file(filename, job_type):
    start = time()

    command = """psql `heroku config:get DATABASE_URL`?ssl=true -c "\copy {table_name} (id) FROM '{filename}' WITH CSV DELIMITER E'|';" """.format(
        table_name=table_name(job_type), filename=filename)
    call(command, shell=True)

    q = "update {} set id=lower(id)".format(table_name(job_type))
    run_sql(db, q)

    logger.info(u"add_dois_to_queue_from_file done in {} seconds".format(elapsed(start, 1)))
    print_status(job_type)
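# Hypothetical usage sketch: the file handed to add_dois_to_queue_from_file
# holds one id per line; '|' is used as the CSV delimiter so DOIs containing
# commas load cleanly, and mixed case is fine because ids are lowercased
# right after the \copy. (Filename and DOIs below are made up.)
with open("dois_to_queue.txt", "w") as f:
    f.write("10.1234/example.one\n")
    f.write("10.1234/Example.TWO,part\n")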
def worker_run(self, **kwargs):
    run_class = Pub

    single_obj_id = kwargs.get("id", None)
    chunk_size = kwargs.get("chunk", 100)
    limit = kwargs.get("limit", None)

    if limit is None:
        limit = float("inf")

    if single_obj_id:
        single_obj_id = clean_doi(single_obj_id)
        objects = [run_class.query.filter(run_class.id == single_obj_id).first()]
        extract_pub_pdf_urls(objects)
    else:
        index = 0
        num_updated = 0
        start_time = time()

        while num_updated < limit:
            new_loop_start_time = time()

            objects = self.fetch_queue_chunk(chunk_size)

            if not objects:
                sleep(5)
                continue

            object_ids = [obj.id for obj in objects]
            extract_pub_pdf_urls(objects)

            object_ids_str = u",".join([u"'{}'".format(oid.replace(u"'", u"''")) for oid in object_ids])
            object_ids_str = object_ids_str.replace(u"%", u"%%")  # sql escaping

            sql_command = u"update {queue_table} set finished=now(), started=null where id in ({ids})".format(
                queue_table=self.table_name(None), ids=object_ids_str)
            run_sql(db, sql_command)

            index += 1
            num_updated += len(objects)
            self.print_update(new_loop_start_time, chunk_size, limit, start_time, index)
def worker_run(self, **kwargs): run_class = PdfUrl single_url = kwargs.get("id", None) chunk_size = kwargs.get("chunk", 100) limit = kwargs.get("limit", None) if limit is None: limit = float("inf") if single_url: objects = [run_class.query.filter(run_class.url == single_url).first()] check_pdf_urls(objects) else: index = 0 num_updated = 0 start_time = time() while num_updated < limit: new_loop_start_time = time() objects = self.fetch_queue_chunk(chunk_size) if not objects: sleep(5) continue check_pdf_urls(objects) object_ids = [obj.url for obj in objects] object_ids_str = u",".join([u"'{}'".format(oid.replace(u"'", u"''")) for oid in object_ids]) object_ids_str = object_ids_str.replace(u"%", u"%%") # sql escaping sql_command = u"update {queue_table} set finished=now(), started=null where url in ({ids})".format( queue_table=self.table_name(None), ids=object_ids_str ) run_sql(db, sql_command) index += 1 num_updated += len(objects) self.print_update(new_loop_start_time, chunk_size, limit, start_time, index)
def add_dois_to_queue_from_query(where, job_type):
    logger.info(u"adding all dois, this may take a while")
    start = time()

    table_name = "doi_queue"

    # run_sql(db, "drop table {} cascade".format(table_name(job_type)))
    # create_table_command = "CREATE TABLE {} as (select id, random() as rand, null::timestamp as finished, null::timestamp as started, null::text as dyno from crossref)".format(
    #     table_name(job_type))
    create_table_command = "CREATE TABLE {} as (select id, random() as rand, null::timestamp as finished, null::timestamp as started from pub);".format(
        table_name)
    if where:
        create_table_command = create_table_command.replace(
            "from pub)", "from pub where {})".format(where))
    run_sql(db, create_table_command)

    # the table now exists; run only the follow-up index and autovacuum
    # commands (re-running the CREATE TABLE above would fail)
    setup_commands = """
        alter table {table_name} alter column rand set default random();
        CREATE INDEX {table_name}_id_idx ON {table_name} USING btree (id);
        CREATE INDEX {table_name}_finished_null_rand_idx on {table_name} (rand) where finished is null;
        CREATE INDEX {table_name}_started_null_rand_idx ON {table_name} USING btree (rand, started) WHERE started is null;

        -- from https://lob.com/blog/supercharge-your-postgresql-performance
        -- vacuums and analyzes every ten million rows
        ALTER TABLE {table_name} SET (autovacuum_vacuum_scale_factor = 0.0);
        ALTER TABLE {table_name} SET (autovacuum_vacuum_threshold = 10000000);
        ALTER TABLE {table_name} SET (autovacuum_analyze_scale_factor = 0.0);
        ALTER TABLE {table_name} SET (autovacuum_analyze_threshold = 10000000);
    """.format(table_name=table_name)
    for command in setup_commands.split(";"):
        run_sql(db, command)

    command = """create or replace view export_queue as
        SELECT id AS doi,
            updated AS updated,
            response_jsonb->>'evidence' AS evidence,
            response_jsonb->>'oa_status' AS oa_color,
            response_jsonb->>'free_fulltext_url' AS best_open_url,
            response_jsonb->>'year' AS year,
            response_jsonb->>'found_hybrid' AS found_hybrid,
            response_jsonb->>'found_green' AS found_green,
            response_jsonb->>'error' AS error,
            response_jsonb->>'is_boai_license' AS is_boai_license,
            replace(api->'_source'->>'journal', ' ', '') AS journal,
            replace(api->'_source'->>'publisher', ' ', '') AS publisher,
            api->'_source'->>'title' AS title,
            api->'_source'->>'subject' AS subject,
            response_jsonb->>'license' AS license
        FROM pub where id in (select id from {table_name})""".format(
        table_name=table_name)

    # if job_type:
    #     command_with_hybrid = command.replace("response_jsonb", "response_with_hybrid").replace("export_queue", "export_queue_with_hybrid")
    run_sql(db, command)

    # they are already lowercased
    logger.info(u"add_dois_to_queue_from_query done in {} seconds".format(elapsed(start, 1)))
    print_status(job_type)
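# Sketch (standalone, not project code) of why the settings above mean
# "every ten million rows": postgres fires autovacuum once
#     dead_tuples > autovacuum_vacuum_threshold
#                   + autovacuum_vacuum_scale_factor * reltuples
# so zeroing the scale factor makes the trigger a flat 10,000,000 dead rows,
# no matter how large the queue table grows.
def autovacuum_trigger(reltuples, threshold=10000000, scale_factor=0.0):
    return threshold + scale_factor * reltuples

assert autovacuum_trigger(reltuples=10**3) == autovacuum_trigger(reltuples=10**9)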
def daily_update():
    # get date we want to add to database
    today = datetime.today()
    new_date = (today - timedelta(days=1)).strftime("%Y-%m-%d")

    # check if date already exists, if not then add it
    ts = new_date + " 00:00:00.000000"
    existing_record = db.session.execute(
        "select * from doi_queue_paperbuzz_dates where id = :val", {"val": ts}
    )
    if existing_record.first() is None:
        q = """insert into doi_queue_paperbuzz_dates (
                   select s as id, random() as rand, false as enqueued,
                       null::timestamp as finished, null::timestamp as started, null::text as dyno
                   FROM generate_series('{start}'::timestamp, '{end}'::timestamp, '1 day'::interval) s
               );""".format(start=new_date, end=new_date)
        run_sql(db, q)

    # run update script
    os.system("python doi_queue.py --dates --run")
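# Standalone sketch of the id format daily_update checks for: yesterday's
# date at midnight, with microseconds, as stored in doi_queue_paperbuzz_dates.
# (The fixed date below is just for illustration.)
from datetime import datetime, timedelta

today = datetime(2020, 1, 2)
new_date = (today - timedelta(days=1)).strftime("%Y-%m-%d")
ts = new_date + " 00:00:00.000000"
assert ts == "2020-01-01 00:00:00.000000"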
def reset_enqueued(self, job_type):
    q = u"update {} set started=null, finished=null".format(self.table_name(job_type))
    run_sql(db, q)
def kick(self, job_type):
    q = u"""update {table_name} set started=null, finished=null
            where finished is null""".format(table_name=self.table_name(job_type))
    run_sql(db, q)
    self.print_status(job_type)
def worker_run(self, **kwargs): single_obj_id = kwargs.get("id", None) chunk = kwargs.get("chunk", 1) queue_table = "doi_queue_dates" run_class = DateRange run_method = "scroll_through_all_dois" if not single_obj_id: text_query_pattern = """WITH picked_from_queue AS ( SELECT * FROM {queue_table} WHERE started is null ORDER BY rand LIMIT {chunk} FOR UPDATE SKIP LOCKED ) UPDATE {queue_table} queue_rows_to_update SET started=now() FROM picked_from_queue WHERE picked_from_queue.id = queue_rows_to_update.id RETURNING picked_from_queue.*;""" text_query = text_query_pattern.format(chunk=chunk, queue_table=queue_table) logger.info(u"the queue query is:\n{}".format(text_query)) index = 0 start_time = time() while True: new_loop_start_time = time() if single_obj_id: objects = [ run_class.query.filter( run_class.id == single_obj_id).first() ] else: # logger.info(u"looking for new jobs") objects = run_class.query.from_statement( text(text_query)).execution_options(autocommit=True).all() # logger.info(u"finished get-new-objects query in {} seconds".format(elapsed(new_loop_start_time))) if not objects: # logger.info(u"sleeping for 5 seconds, then going again") sleep(5) continue object_ids = [obj.id for obj in objects] self.update_fn(run_class, run_method, objects, index=index) if object_ids: object_ids_str = u",".join([ u"'{}'".format(str(id).replace(u"'", u"''")) for id in object_ids ]) object_ids_str = object_ids_str.replace(u"%", u"%%") #sql escaping run_sql( db, u"update {queue_table} set finished=now() where id in ({ids})" .format(queue_table=queue_table, ids=object_ids_str)) # finished is set in update_fn index += 1 if single_obj_id: return else: self.print_update(new_loop_start_time, chunk, chunk, start_time, index)
def reset_enqueued(job_type):
    q = u"update {} set started=null, finished=null".format(table_name(job_type))
    run_sql(db, q)
def worker_run(self, **kwargs): single_obj_id = kwargs.get("id", None) chunk = kwargs.get("chunk", 10) limit = kwargs.get("limit", 10) queue_table = "pmh_record" run_class = PmhRecord run_method = "mint_pages" if single_obj_id: limit = 1 else: if not limit: limit = 1000 text_query_pattern = """WITH picked_from_queue AS ( SELECT * --FROM {queue_table} from (select * from pmh_record where repo_id='digitallibrary.amnh.org/oai/request') s WHERE started is null -- ORDER BY rand LIMIT {chunk} FOR UPDATE SKIP LOCKED ) UPDATE {queue_table} queue_rows_to_update SET started=now() FROM picked_from_queue WHERE picked_from_queue.id = queue_rows_to_update.id RETURNING picked_from_queue.*;""" text_query = text_query_pattern.format(limit=limit, chunk=chunk, queue_table=queue_table) logger.info(u"the queue query is:\n{}".format(text_query)) index = 0 start_time = time() while True: new_loop_start_time = time() if single_obj_id: objects = [ run_class.query.filter( run_class.id == single_obj_id).first() ] else: # logger.info(u"looking for new jobs") objects = run_class.query.from_statement( text(text_query)).execution_options(autocommit=True).all() # logger.info(u"finished get-new-objects query in {} seconds".format(elapsed(new_loop_start_time))) if not objects: # logger.info(u"sleeping for 5 seconds, then going again") sleep(5) continue object_ids = [obj.id for obj in objects] self.update_fn(run_class, run_method, objects, index=index) object_ids_str = u",".join( [u"'{}'".format(id.replace(u"'", u"''")) for id in object_ids]) object_ids_str = object_ids_str.replace(u"%", u"%%") #sql escaping run_sql( db, u"update {queue_table} set finished=now() where id in ({ids})". format(queue_table=queue_table, ids=object_ids_str)) # finished is set in update_fn index += 1 if single_obj_id: return else: self.print_update(new_loop_start_time, chunk, limit, start_time, index)
def worker_run(self, **kwargs): single_obj_id = kwargs.get("id", None) chunk = kwargs.get("chunk", 100) limit = kwargs.get("limit", 10) run_class = Pub run_method = kwargs.get("method") if single_obj_id: limit = 1 queue_table = None elif run_method == "refresh": queue_table = "pub_refresh_queue" if not limit: limit = 1000 text_query_pattern = """ with refresh_queue as ( select id from {queue_table} where started is null order by priority desc, finished nulls first, started, rand limit {chunk} for update skip locked ) update {queue_table} queue_rows_to_update set started = now() from refresh_queue where refresh_queue.id = queue_rows_to_update.id returning refresh_queue.id;""" text_query = text_query_pattern.format( chunk=chunk, queue_table=queue_table ) logger.info(u"the queue query is:\n{}".format(text_query)) else: queue_table = "pub_queue" if not limit: limit = 1000 text_query_pattern = """WITH update_pub_queue AS ( SELECT id FROM {queue_table} WHERE started is null order by finished asc nulls first LIMIT {chunk} FOR UPDATE SKIP LOCKED ) UPDATE {queue_table} queue_rows_to_update SET started=now() FROM update_pub_queue WHERE update_pub_queue.id = queue_rows_to_update.id RETURNING update_pub_queue.id;""" text_query = text_query_pattern.format( limit=limit, chunk=chunk, queue_table=queue_table ) logger.info(u"the queue query is:\n{}".format(text_query)) index = 0 start_time = time() while True: new_loop_start_time = time() if single_obj_id: single_obj_id = clean_doi(single_obj_id) objects = [run_class.query.filter(run_class.id == single_obj_id).first()] else: logger.info(u"looking for new jobs") job_time = time() row_list = db.engine.execute(text(text_query).execution_options(autocommit=True)).fetchall() object_ids = [row[0] for row in row_list] logger.info(u"got ids, took {} seconds".format(elapsed(job_time))) job_time = time() q = db.session.query(Pub).options(orm.undefer('*')).filter(Pub.id.in_(object_ids)) objects = q.all() logger.info(u"got pub objects in {} seconds".format(elapsed(job_time))) # shuffle them or they sort by doi order random.shuffle(objects) # objects = Pub.query.from_statement(text(text_query)).execution_options(autocommit=True).all() # objects = run_class.query.from_statement(text(text_query)).execution_options(autocommit=True).all() # id_rows = db.engine.execute(text(text_query)).fetchall() # ids = [row[0] for row in id_rows] # # job_time = time() # objects = run_class.query.filter(run_class.id.in_(ids)).all() # logger.info(u"finished get-new-objects query in {} seconds".format(elapsed(job_time))) if not objects: # logger.info(u"sleeping for 5 seconds, then going again") sleep(5) continue object_ids = [obj.id for obj in objects] self.update_fn(run_class, run_method, objects, index=index) # logger.info(u"finished update_fn") if queue_table: object_ids_str = u",".join([u"'{}'".format(id.replace(u"'", u"''")) for id in object_ids]) object_ids_str = object_ids_str.replace(u"%", u"%%") #sql escaping sql_command = u"update {queue_table} set finished=now(), started=null where id in ({ids})".format( queue_table=queue_table, ids=object_ids_str) # logger.info(u"sql command to update finished is: {}".format(sql_command)) run_sql(db, sql_command) # logger.info(u"finished run_sql") # finished is set in update_fn index += 1 if single_obj_id: return else: self.print_update(new_loop_start_time, chunk, limit, start_time, index)
def truncate(job_type):
    q = "truncate table {}".format(table_name(job_type))
    run_sql(db, q)
def kick(job_type):
    q = u"""update {table_name} set started=null, finished=null
            where finished is null""".format(table_name=table_name(job_type))
    run_sql(db, q)
    print_status(job_type)
def truncate(self, job_type):
    q = "truncate table {}".format(self.table_name(job_type))
    run_sql(db, q)
def worker_run(self, **kwargs): single_obj_id = kwargs.get("id", None) chunk = kwargs.get("chunk", 100) limit = kwargs.get("limit", 10) run_class = Pub run_method = kwargs.get("method") if single_obj_id: limit = 1 queue_table = None elif run_method == "refresh": queue_table = "pub_refresh_queue" if not limit: limit = 1000 text_query_pattern = """ with refresh_queue as ( select id from {queue_table} where started is null order by priority desc, finished nulls first, started, rand limit {chunk} for update skip locked ) update {queue_table} queue_rows_to_update set started = now() from refresh_queue where refresh_queue.id = queue_rows_to_update.id returning refresh_queue.id;""" text_query = text_query_pattern.format(chunk=chunk, queue_table=queue_table) logger.info(u"the queue query is:\n{}".format(text_query)) else: queue_table = "pub_queue" if not limit: limit = 1000 text_query_pattern = """WITH update_pub_queue AS ( SELECT id FROM {queue_table} WHERE started is null order by finished asc nulls first LIMIT {chunk} FOR UPDATE SKIP LOCKED ) UPDATE {queue_table} queue_rows_to_update SET started=now() FROM update_pub_queue WHERE update_pub_queue.id = queue_rows_to_update.id RETURNING update_pub_queue.id;""" text_query = text_query_pattern.format(limit=limit, chunk=chunk, queue_table=queue_table) logger.info(u"the queue query is:\n{}".format(text_query)) index = 0 start_time = time() while True: new_loop_start_time = time() if single_obj_id: single_obj_id = clean_doi(single_obj_id) objects = [ run_class.query.filter( run_class.id == single_obj_id).first() ] else: logger.info(u"looking for new jobs") job_time = time() row_list = db.engine.execute( text(text_query).execution_options( autocommit=True)).fetchall() object_ids = [row[0] for row in row_list] logger.info(u"got ids, took {} seconds".format( elapsed(job_time))) job_time = time() q = db.session.query(Pub).options(orm.undefer('*')).filter( Pub.id.in_(object_ids)) objects = q.all() logger.info(u"got pub objects in {} seconds".format( elapsed(job_time))) # shuffle them or they sort by doi order random.shuffle(objects) # objects = Pub.query.from_statement(text(text_query)).execution_options(autocommit=True).all() # objects = run_class.query.from_statement(text(text_query)).execution_options(autocommit=True).all() # id_rows = db.engine.execute(text(text_query)).fetchall() # ids = [row[0] for row in id_rows] # # job_time = time() # objects = run_class.query.filter(run_class.id.in_(ids)).all() # logger.info(u"finished get-new-objects query in {} seconds".format(elapsed(job_time))) if not objects: # logger.info(u"sleeping for 5 seconds, then going again") sleep(5) continue object_ids = [obj.id for obj in objects] self.update_fn(run_class, run_method, objects, index=index) # logger.info(u"finished update_fn") if queue_table: object_ids_str = u",".join([ u"'{}'".format(id.replace(u"'", u"''")) for id in object_ids ]) object_ids_str = object_ids_str.replace(u"%", u"%%") #sql escaping sql_command = u"update {queue_table} set finished=now(), started=null where id in ({ids})".format( queue_table=queue_table, ids=object_ids_str) # logger.info(u"sql command to update finished is: {}".format(sql_command)) run_sql(db, sql_command) # logger.info(u"finished run_sql") # finished is set in update_fn index += 1 if single_obj_id: return else: self.print_update(new_loop_start_time, chunk, limit, start_time, index)
def run(self, **kwargs):
    single_obj_id = kwargs.get("id", None)
    limit = kwargs.get("limit", 0)
    chunk = kwargs.get("chunk", self.chunk)
    after = kwargs.get("after", None)

    queue_table = "doi_queue"

    if single_obj_id:
        limit = 1
    else:
        if not limit:
            limit = 1000

        ## based on http://dba.stackexchange.com/a/69497
        if self.action_table == "base":
            text_query_pattern = """WITH selected AS (
                       SELECT *
                       FROM {table}
                       WHERE queue != '{queue_name}' and {where}
                       LIMIT {chunk}
                       FOR UPDATE SKIP LOCKED
                   )
                   UPDATE {table} records_to_update
                   SET queue='{queue_name}'
                   FROM selected
                   WHERE selected.id = records_to_update.id
                   RETURNING records_to_update.id;"""
            text_query = text_query_pattern.format(
                table=self.action_table,
                where=self.where,
                chunk=chunk,
                queue_name=self.queue_name)
        else:
            my_dyno_name = os.getenv("DYNO", "unknown")
            if kwargs.get("hybrid", False) or "hybrid" in my_dyno_name:
                queue_table += "_with_hybrid"
            elif kwargs.get("dates", False) or "dates" in my_dyno_name:
                queue_table += "_dates"
            text_query_pattern = """WITH picked_from_queue AS (
                       SELECT * FROM {queue_table}
                       WHERE started is null
                       ORDER BY rand
                       LIMIT {chunk}
                       FOR UPDATE SKIP LOCKED
                   )
                   UPDATE {queue_table} doi_queue_rows_to_update
                   SET started=now()
                   FROM picked_from_queue
                   WHERE picked_from_queue.id = doi_queue_rows_to_update.id
                   RETURNING doi_queue_rows_to_update.id;"""
            text_query = text_query_pattern.format(chunk=chunk, queue_table=queue_table)
        logger.info(u"the queue query is:\n{}".format(text_query))

    index = 0
    start_time = time()
    while True:
        new_loop_start_time = time()
        if single_obj_id:
            object_ids = [single_obj_id]
        else:
            # logger.info(u"looking for new jobs")
            row_list = db.engine.execute(text(text_query).execution_options(autocommit=True)).fetchall()
            object_ids = [row[0] for row in row_list]
            # logger.info(u"finished get-new-ids query in {} seconds".format(elapsed(new_loop_start_time)))

        if not object_ids:
            # logger.info(u"sleeping for 5 seconds, then going again")
            sleep(5)
            continue

        update_fn_args = [self.cls, self.method, object_ids]

        shortcut_data = None
        if self.shortcut_fn_per_chunk:
            shortcut_data_start = time()
            logger.info(u"Getting shortcut data...")
            shortcut_data = self.shortcut_fn_per_chunk()
            logger.info(u"Got shortcut data in {} seconds".format(elapsed(shortcut_data_start)))

        update_fn(*update_fn_args, index=index, shortcut_data=shortcut_data)

        try:
            ids_escaped = [id.replace(u"'", u"''") for id in object_ids]
        except TypeError:
            ids_escaped = object_ids
        object_ids_str = u",".join([u"'{}'".format(id) for id in ids_escaped])
        object_ids_str = object_ids_str.replace(u"%", u"%%")  # sql escaping
        run_sql(db, u"update {queue_table} set finished=now() where id in ({ids})".format(
            queue_table=queue_table, ids=object_ids_str))

        index += 1

        if single_obj_id:
            return
        else:
            num_items = limit  # let's say we have to do the full limit
            num_jobs_remaining = num_items - (index * chunk)
            try:
                jobs_per_hour_this_chunk = chunk / float(elapsed(new_loop_start_time) / 3600)
                predicted_mins_to_finish = round(
                    (num_jobs_remaining / float(jobs_per_hour_this_chunk)) * 60, 1
                )
                logger.info(u"\n\nWe're doing {} jobs per hour. At this rate, if we had to do everything up to limit, done in {}min".format(
                    int(jobs_per_hour_this_chunk), predicted_mins_to_finish))
                logger.info(u"\t{} seconds this loop, {} chunks in {} seconds, {} seconds/chunk average\n".format(
                    elapsed(new_loop_start_time), index, elapsed(start_time),
                    round(elapsed(start_time) / float(index), 1)))
            except ZeroDivisionError:
                # logger.info(u"not printing status because divide by zero")
                logger.info(u".")
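# Quick sanity-check sketch (not part of the worker) of the rate math printed
# by run() above: a chunk of 100 ids finishing in 30 seconds is
# 100 / (30 / 3600.0) = 12,000 jobs/hour, so 900 remaining jobs should take
# 900 / 12000.0 * 60 = 4.5 minutes.
chunk, loop_seconds, num_jobs_remaining = 100, 30.0, 900

jobs_per_hour = chunk / (loop_seconds / 3600)
predicted_mins_to_finish = round((num_jobs_remaining / jobs_per_hour) * 60, 1)

assert jobs_per_hour == 12000
assert predicted_mins_to_finish == 4.5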