def do_link_batch_update_sess(logger, interface, link_batch):
    """
    Upsert a batch of raw link rows into the database via ``upsert_link_raw()``.

    NOTE(review): a second, richer definition of ``do_link_batch_update_sess``
    appears later in this file and shadows this one at import time — confirm
    which is intended to be live.

    Args:
        logger:     a logging.Logger-style object.
        interface:  a SQLAlchemy session; its raw DBAPI cursor is extracted
                    and used directly.
        link_batch: list of dicts, each with exactly the keys
                    url, starturl, netloc, distance, priority, state,
                    addtime, epoch.

    Raises:
        AssertionError (after logging the offending item) when a required key
        is missing or an unexpected key is present.

    Insertion strategy — each step is a fallback for the previous one:
      1. Bulk upsert in chunks of 50 via execute_batch().
      2. Bulk upsert in chunks of 5.
      3. Row-by-row upsert in one transaction.
      4. Row-by-row upsert, committing after every row.

    Returns the touched-row count for the bulk paths, None for the
    row-by-row fallback paths.
    """
    if not link_batch:
        return

    expected_keys = {
        'url',
        'starturl',
        'netloc',
        'distance',
        'priority',
        'state',
        'addtime',
        'epoch',
    }

    # Validate every item before touching the DB so a malformed entry
    # fails fast instead of mid-batch.
    for item in link_batch:
        try:
            for key in expected_keys:
                assert key in item
        except AssertionError:
            logger.error("Missing key from raw entry: ")
            item_str = pprint.pformat(item)
            for line in item_str.split("\n"):
                logger.error("	%s", line.rstrip())
            raise

        item_keys = set(item.keys())
        excess_keys = item_keys - expected_keys
        try:
            assert not excess_keys
        except AssertionError:
            logger.error("Excess key(s) in raw entry: '%s'", excess_keys)
            item_str = pprint.pformat(item)
            for line in item_str.split("\n"):
                logger.error("	%s", line.rstrip())
            raise

    logger.info("Inserting %s items into DB in batch.", len(link_batch))

    # This is kind of horrible.
    # Reach down through sqlalchemy and pull out the raw DBAPI cursor directly.
    raw_cur = interface.connection().connection.cursor()

    per_cmd = """
        SELECT upsert_link_raw(
            %(url)s,
            %(starturl)s,
            %(netloc)s,
            %(distance)s,
            %(priority)s,
            %(addtime)s,
            %(state)s,
            %(epoch)s
            );
    """

    # Flatten the statement onto a single line and collapse whitespace runs.
    # (The previous version replaced a space with a space inside a
    # `while " " in per_cmd` loop, which never terminates.)
    per_cmd = per_cmd.replace("\n", " ")
    while "  " in per_cmd:
        per_cmd = per_cmd.replace("  ", " ")

    # Somehow we're getting here with an open transaction. I have no idea
    # what's opening them. Something something DBAPI.
    raw_cur.execute("COMMIT;")

    # Attempt 1: bulk upsert in chunks of 50.
    rowcnt = 0
    try:
        for subc in misc.batch(link_batch, 50):
            # We don't care about isolation for these operations, as each
            # operation is functionally independent.
            raw_cur.execute("BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED;")
            # We use a statement timeout context of 2500 ms, so we don't get
            # wedged on a lock.
            raw_cur.execute("SET statement_timeout TO 2500;")
            # We try the bulk insert command first.
            psycopg_execute_batch.execute_batch(raw_cur, per_cmd, subc)
            rowcnt += raw_cur.rowcount
            raw_cur.execute("COMMIT;")
        raw_cur.execute("RESET statement_timeout;")
        link_batch = []
        logger.info("Touched AT LEAST %s rows", rowcnt)
        return rowcnt
    except psycopg2.Error:
        logger.error("psycopg2.Error - Failure on bulk insert.")
        for line in traceback.format_exc().split("\n"):
            logger.error(line)
        raw_cur.execute("ROLLBACK;")
        logger.error("Retrying.")

    # Attempt 2: bulk upsert again, but in much smaller chunks of 5.
    rowcnt = 0
    try:
        for subc in misc.batch(link_batch, 5):
            # We don't care about isolation for these operations, as each
            # operation is functionally independent.
            raw_cur.execute("BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED;")
            # We use a statement timeout context of 2500 ms, so we don't get
            # wedged on a lock.
            raw_cur.execute("SET statement_timeout TO 2500;")
            # We try the bulk insert command first.
            psycopg_execute_batch.execute_batch(raw_cur, per_cmd, subc)
            rowcnt += raw_cur.rowcount
            raw_cur.execute("COMMIT;")
        raw_cur.execute("RESET statement_timeout;")
        link_batch = []
        logger.info("Touched AT LEAST %s rows", rowcnt)
        return rowcnt
    except psycopg2.Error:
        logger.error("psycopg2.Error - Failure on bulk insert.")
        for line in traceback.format_exc().split("\n"):
            logger.error(line)
        raw_cur.execute("ROLLBACK;")
        logger.error("Retrying with per upsert commit.")

    # If the bulk insert failed, we then try a per-URL upsert.
    # We only commit per-URL if we've tried to do the per-URL update in
    # batch, and failed.
    commit_each = False
    while 1:
        rowcnt = 0
        try:
            raw_cur.execute("BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED;")
            # We use a statement timeout context of 2500 ms, so we don't get
            # wedged on a lock.
            raw_cur.execute("SET statement_timeout TO 2500;")
            for paramset in link_batch:
                assert isinstance(paramset['starturl'], str)
                if len(paramset['url']) > 2000:
                    logger.error("URL Is too long to insert into the database!")
                    logger.error("URL: '%s'", paramset['url'])
                else:
                    # Forward-date the next walk time, rather than using the
                    # now-value for the threshold.
                    raw_cur.execute(per_cmd, paramset)
                    rowcnt += raw_cur.rowcount

                if commit_each:
                    raw_cur.execute("COMMIT;")
                    raw_cur.execute("BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED;")
                    # We use a statement timeout context of 2500 ms, so we
                    # don't get wedged on a lock.
                    raw_cur.execute("SET statement_timeout TO 2500;")

            raw_cur.execute("COMMIT;")
            break
        except psycopg2.Error:
            if commit_each is False:
                logger.warning("psycopg2.Error - Retrying with commit each.")
            else:
                logger.warning("psycopg2.Error - Retrying.")
            traceback.print_exc()
            raw_cur.execute("ROLLBACK;")
            commit_each = True

    raw_cur.execute("RESET statement_timeout;")
    logger.info("Changed %s rows", rowcnt)
    return
def do_link_batch_update_sess(logger, interface, link_batch, max_pri=None, show_progress=False):
    """
    Upsert a batch of link rows into the database via ``upsert_link()``.

    NOTE(review): this shadows the earlier, simpler definition of
    ``do_link_batch_update_sess`` in this file — confirm the duplication
    is intentional.

    Args:
        logger:        a logging.Logger-style object.
        interface:     a SQLAlchemy session; its raw DBAPI cursor is
                       extracted and used directly.
        link_batch:    list of dicts, each with the keys url, starturl,
                       netloc, distance, is_text, priority, type, addtime,
                       state, epoch, and optionally maximum_priority
                       (defaulted to the item's priority when absent).
        max_pri:       floor for each item's maximum_priority; defaults to
                       db.DB_LOW_PRIORITY when None.
        show_progress: passed through to misc.batch() for progress display.

    Raises:
        AssertionError (after logging the offending item) when a required
        key is missing, an unexpected key is present, or a URL exceeds the
        postgres index-row limit.

    Insertion strategy — each step is a fallback for the previous one:
      1. Bulk upsert in chunks of 50 via execute_batch() (5000 ms timeout).
      2. Bulk upsert in chunks of 5 (2500 ms timeout).
      3. Row-by-row upsert in one transaction.
      4. Row-by-row upsert, committing after every row.

    Returns the touched-row count for the bulk paths, None for the
    row-by-row fallback paths.
    """
    if not link_batch:
        return

    expected_keys = {
        'url',
        'starturl',
        'netloc',
        'distance',
        'is_text',
        'priority',
        'type',
        'addtime',
        'state',
        'epoch',
        'maximum_priority',  # Optional
    }

    # Validate every item before touching the DB so a malformed entry
    # fails fast instead of mid-batch.
    for item in link_batch:
        try:
            # 'maximum_priority' is intentionally absent here — it is
            # optional and defaulted below.
            for key in ('url', 'starturl', 'netloc', 'distance', 'is_text',
                        'priority', 'type', 'addtime', 'state', 'epoch'):
                assert key in item

            if 'maximum_priority' not in item:
                item['maximum_priority'] = item['priority']
            if item['distance'] < item['maximum_priority']:
                item['distance'] = item['maximum_priority']
            assert 'maximum_priority' in item

            # psycopg2cffi._impl.exceptions.OperationalError: index row size
            # 3192 exceeds maximum 2712 for index "ix_web_pages_url"
            assert len(item['url']) < 2712, \
                "URL Too long for postgres. Length %s for url '%s'" % (
                    len(item['url']), item['url'])

            if max_pri is None:
                max_pri = db.DB_LOW_PRIORITY
            # Raise maximum_priority (and distance) up to the floor value.
            if item['maximum_priority'] < max_pri:
                item['maximum_priority'] = max_pri
            if item['distance'] < item['maximum_priority']:
                item['distance'] = item['maximum_priority']
        except AssertionError:
            logger.error("Missing key from entry: ")
            item_str = pprint.pformat(item)
            for line in item_str.split("\n"):
                logger.error("	%s", line.rstrip())
            raise

        item_keys = set(item.keys())
        excess_keys = item_keys - expected_keys
        try:
            assert not excess_keys
        except AssertionError:
            logger.error("Excess key(s) in entry: '%s'", excess_keys)
            item_str = pprint.pformat(item)
            for line in item_str.split("\n"):
                logger.error("	%s", line.rstrip())
            raise

    logger.info("Inserting %s items into DB in batch.", len(link_batch))

    # This is kind of horrible.
    # Reach down through sqlalchemy and pull out the raw DBAPI cursor directly.
    try:
        raw_cur = interface.connection().connection.cursor()
    except sqlalchemy.exc.InvalidRequestError:
        # The session has a pending failed transaction; clear it and retry.
        interface.rollback()
        raw_cur = interface.connection().connection.cursor()

    per_cmd = """
        SELECT upsert_link(
            %(url)s,
            %(starturl)s,
            %(netloc)s,
            %(distance)s,
            %(is_text)s,
            %(priority)s,
            %(type)s,
            %(addtime)s,
            %(state)s,
            %(maximum_priority)s,
            %(epoch)s
            );
    """

    # Flatten the statement onto a single line and collapse whitespace runs.
    # (The previous version replaced a space with a space inside a
    # `while " " in per_cmd` loop, which never terminates.)
    per_cmd = per_cmd.replace("\n", " ")
    while "  " in per_cmd:
        per_cmd = per_cmd.replace("  ", " ")

    # Somehow we're getting here with an open transaction. I have no idea
    # what's opening them. Something something DBAPI.
    raw_cur.execute("COMMIT;")

    # Attempt 1: bulk upsert in chunks of 50.
    rowcnt = 0
    try:
        for subc in misc.batch(link_batch, 50, show_progress=show_progress):
            # We don't care about isolation for these operations, as each
            # operation is functionally independent.
            raw_cur.execute("BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED;")
            # We use a statement timeout context of 5000 ms, so we don't get
            # wedged on a lock.
            raw_cur.execute("SET statement_timeout TO 5000;")
            # We try the bulk insert command first.
            psycopg_execute_batch.execute_batch(raw_cur, per_cmd, subc)
            rowcnt += raw_cur.rowcount
            raw_cur.execute("COMMIT;")
        raw_cur.execute("RESET statement_timeout;")
        link_batch = []
        logger.info("Touched AT LEAST %s rows", rowcnt)
        return rowcnt
    except psycopg2.Error:
        logger.error("psycopg2.Error - Failure on bulk insert.")
        for line in traceback.format_exc().split("\n"):
            logger.error(line)
        raw_cur.execute("ROLLBACK;")
        logger.error("Retrying.")

    # Attempt 2: bulk upsert again, but in much smaller chunks of 5.
    rowcnt = 0
    try:
        for subc in misc.batch(link_batch, 5, show_progress=show_progress):
            # We don't care about isolation for these operations, as each
            # operation is functionally independent.
            raw_cur.execute("BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED;")
            # We use a statement timeout context of 2500 ms, so we don't get
            # wedged on a lock.
            raw_cur.execute("SET statement_timeout TO 2500;")
            # We try the bulk insert command first.
            psycopg_execute_batch.execute_batch(raw_cur, per_cmd, subc)
            rowcnt += raw_cur.rowcount
            raw_cur.execute("COMMIT;")
        raw_cur.execute("RESET statement_timeout;")
        link_batch = []
        logger.info("Touched AT LEAST %s rows", rowcnt)
        return rowcnt
    except psycopg2.Error:
        logger.error("psycopg2.Error - Failure on bulk insert.")
        for line in traceback.format_exc().split("\n"):
            logger.error(line)
        raw_cur.execute("ROLLBACK;")
        logger.error("Retrying with per upsert commit.")

    # If the bulk insert failed, we then try a per-URL upsert.
    # We only commit per-URL if we've tried to do the per-URL update in
    # batch, and failed.
    commit_each = False
    while 1:
        rowcnt = 0
        try:
            raw_cur.execute("BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED;")
            # We use a statement timeout context of 2500 ms, so we don't get
            # wedged on a lock.
            raw_cur.execute("SET statement_timeout TO 2500;")
            for paramset in link_batch:
                assert isinstance(paramset['starturl'], str)
                if len(paramset['url']) > 2000:
                    logger.error("URL Is too long to insert into the database!")
                    logger.error("URL: '%s'", paramset['url'])
                else:
                    # Forward-date the next walk time, rather than using the
                    # now-value for the threshold.
                    raw_cur.execute(per_cmd, paramset)
                    rowcnt += raw_cur.rowcount

                if commit_each:
                    raw_cur.execute("COMMIT;")
                    raw_cur.execute("BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED;")
                    # We use a statement timeout context of 2500 ms, so we
                    # don't get wedged on a lock.
                    raw_cur.execute("SET statement_timeout TO 2500;")

            raw_cur.execute("COMMIT;")
            break
        except psycopg2.Error:
            if commit_each is False:
                logger.warning("psycopg2.Error - Retrying with commit each.")
            else:
                logger.warning("psycopg2.Error - Retrying.")
            traceback.print_exc()
            raw_cur.execute("ROLLBACK;")
            commit_each = True

    raw_cur.execute("RESET statement_timeout;")
    logger.info("Changed %s rows", rowcnt)
    return