def show_dest_counts(nlpdef: NlpDefinition) -> None:
    """
    Show the number of records in all destination tables.
    """
    print("DESTINATION TABLE RECORD COUNTS:")
    counts = []  # type: List[Tuple[str, int]]
    for processor in nlpdef.get_processors():
        session = processor.get_session()
        dbname = processor.get_dbname()
        for tablename in processor.get_tablenames():
            n = count_star(session, tablename)
            counts.append(("DESTINATION: {}.{}".format(dbname, tablename), n))
    print_record_counts(counts)
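
# -----------------------------------------------------------------------------
# Illustrative sketch only, not called by CRATE: count_star() is a helper
# defined elsewhere in CRATE. For orientation, this is a minimal sketch of
# what such a helper might look like in SQLAlchemy Core; the function name
# below is a hypothetical placeholder.
# -----------------------------------------------------------------------------

def _sketch_count_star(session, tablename: str) -> int:
    from sqlalchemy.sql import func, select, table
    # table() builds a lightweight TableClause from the name alone, which is
    # enough to express SELECT COUNT(*) FROM tablename.
    query = select([func.count()]).select_from(table(tablename))
    return session.execute(query).scalar()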
def drop_remake(progargs,
                nlpdef: NlpDefinition,
                incremental: bool = False,
                skipdelete: bool = False) -> None:
    """
    Drop output tables and recreate them.
    """
    # Not parallel.

    # -------------------------------------------------------------------------
    # 1. Progress database
    # -------------------------------------------------------------------------
    progengine = nlpdef.get_progdb_engine()
    if not incremental:
        log.debug("Dropping progress tables")
        NlpRecord.__table__.drop(progengine, checkfirst=True)
    log.info("Creating progress table (with index)")
    NlpRecord.__table__.create(progengine, checkfirst=True)

    # -------------------------------------------------------------------------
    # 2. Output database(s)
    # -------------------------------------------------------------------------
    pretty_names = []  # type: List[str]
    for processor in nlpdef.get_processors():
        new_pretty_names = processor.make_tables(drop_first=not incremental)
        for npn in new_pretty_names:
            if npn in pretty_names:
                log.warning("An NLP processor has tried to re-make a table "
                            "made by one of its colleagues: {}".format(npn))
        pretty_names.extend(new_pretty_names)

    # -------------------------------------------------------------------------
    # 3. Delete WHERE NOT IN for incremental
    # -------------------------------------------------------------------------
    for ifconfig in nlpdef.get_ifconfigs():
        with MultiTimerContext(timer, TIMING_DELETE_WHERE_NO_SOURCE):
            if incremental:
                if not skipdelete:
                    delete_where_no_source(
                        nlpdef, ifconfig,
                        report_every=progargs.report_every_fast,
                        chunksize=progargs.chunksize)
            else:  # full
                ifconfig.delete_all_progress_records()

    # -------------------------------------------------------------------------
    # 4. Overall commit (superfluous)
    # -------------------------------------------------------------------------
    nlpdef.commit_all()
def process_nlp(nlpdef: NlpDefinition,
                incremental: bool = False,
                report_every: int = DEFAULT_REPORT_EVERY_NLP,
                tasknum: int = 0,
                ntasks: int = 1) -> None:
    """
    Main NLP processing function. Fetch text, send it to the GATE app
    (storing the results), and make a note in the progress database.
    """
    log.info(SEP + "NLP")
    session = nlpdef.get_progdb_session()
    for ifconfig in nlpdef.get_ifconfigs():
        i = 0  # record count within this process
        recnum = tasknum  # record count overall
        totalcount = ifconfig.get_count()  # total number of records in table
        for text, other_values in ifconfig.gen_text(tasknum=tasknum,
                                                    ntasks=ntasks):
            i += 1
            pkval = other_values[FN_SRCPKVAL]
            pkstr = other_values[FN_SRCPKSTR]
            if report_every and i % report_every == 0:
                log.info(
                    "Processing {db}.{t}.{c}, PK: {pkf}={pkv} "
                    "({overall}record {approx}{recnum}/{totalcount})"
                    "{thisproc}".format(
                        db=other_values[FN_SRCDB],
                        t=other_values[FN_SRCTABLE],
                        c=other_values[FN_SRCFIELD],
                        pkf=other_values[FN_SRCPKFIELD],
                        pkv=pkstr if pkstr else pkval,
                        overall="overall " if ntasks > 1 else "",
                        approx="~" if pkstr and ntasks > 1 else "",
                        # ... string hashing means approx. distribution
                        recnum=recnum + 1,
                        i=i,
                        totalcount=totalcount,
                        thisproc=(
                            " ({i}/~{proccount} this process)".format(
                                i=i,
                                proccount=totalcount // ntasks)
                            if ntasks > 1 else ""
                        )))
            recnum += ntasks
            # log.critical("other_values={}".format(repr(other_values)))
            srchash = nlpdef.hash(text)

            progrec = None
            if incremental:
                progrec = ifconfig.get_progress_record(pkval, pkstr)
                if progrec is not None:
                    if progrec.srchash == srchash:
                        log.debug("Record previously processed; skipping")
                        continue
                    else:
                        log.debug("Record has changed")
                else:
                    log.debug("Record is new")

            for processor in nlpdef.get_processors():
                if incremental:
                    processor.delete_dest_record(ifconfig, pkval, pkstr,
                                                 commit=incremental)
                processor.process(text, other_values)

            # Make a note in the progress database that we've processed a
            # source record.
            if progrec:  # modifying an existing record
                progrec.whenprocessedutc = nlpdef.get_now()
                progrec.srchash = srchash
            else:  # creating a new record
                progrec = NlpRecord(
                    # Quasi-key fields:
                    srcdb=ifconfig.get_srcdb(),
                    srctable=ifconfig.get_srctable(),
                    srcpkval=pkval,
                    srcpkstr=pkstr,
                    srcfield=ifconfig.get_srcfield(),
                    nlpdef=nlpdef.get_name(),
                    # Other fields:
                    srcpkfield=ifconfig.get_srcpkfield(),
                    whenprocessedutc=nlpdef.get_now(),
                    srchash=srchash,
                )
                with MultiTimerContext(timer, TIMING_PROGRESS_DB_ADD):
                    session.add(progrec)

            # In incremental mode, do we commit immediately, because other
            # processes may need this table promptly... ?

            # force_commit = False  # definitely wrong; crashes as below
            # force_commit = incremental
            force_commit = ntasks > 1

            # - A single source record should not be processed by >1 CRATE
            #   process. So in theory there should be no conflicts.
            # - However, databases can lock in various ways. Can we guarantee
            #   it'll do something sensible?
            # - See also
            #   https://en.wikipedia.org/wiki/Isolation_(database_systems)
            #   http://skien.cc/blog/2014/02/06/sqlalchemy-and-race-conditions-follow-up/  # noqa
            #   http://docs.sqlalchemy.org/en/latest/core/connections.html?highlight=execution_options#sqlalchemy.engine.Connection.execution_options  # noqa
            # - However, empirically, setting this to False gives
            #   "Transaction (Process ID xx) was deadlocked on lock resources
            #   with another process and has been chosen as the deadlock
            #   victim. Rerun the transaction." -- with a SELECT query.
            # - SQL Server uses READ COMMITTED as the default isolation level.
            # - https://technet.microsoft.com/en-us/library/jj856598(v=sql.110).aspx  # noqa
            # - (An illustrative isolation-level sketch follows this
            #   function.)

            nlpdef.notify_transaction(
                session=session, n_rows=1,
                n_bytes=sys.getsizeof(progrec),  # approx
                force_commit=force_commit)

    nlpdef.commit_all()
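
# -----------------------------------------------------------------------------
# Illustrative sketch only, not called by CRATE: the deadlock notes in
# process_nlp() above concern transaction isolation. This is a minimal
# demonstration of choosing an isolation level per connection in SQLAlchemy;
# the function name and connection URL are hypothetical placeholders. Raising
# the level (e.g. SERIALIZABLE) trades concurrency for safety; process_nlp()
# instead keeps transactions short via force_commit when ntasks > 1.
# -----------------------------------------------------------------------------

def _sketch_isolation_level() -> None:
    from sqlalchemy import create_engine
    from sqlalchemy.sql import text
    engine = create_engine("mssql+pyodbc://my_dsn")  # hypothetical DSN
    with engine.connect() as conn:
        # execution_options() yields a connection whose subsequent statements
        # run under the requested isolation level (READ COMMITTED being SQL
        # Server's default, as noted above).
        conn = conn.execution_options(isolation_level="READ COMMITTED")
        conn.execute(text("SELECT 1"))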
def delete_where_no_source(nlpdef: NlpDefinition,
                           ifconfig: InputFieldConfig,
                           report_every: int = DEFAULT_REPORT_EVERY,
                           chunksize: int = DEFAULT_CHUNKSIZE) -> None:
    """
    Delete destination records where source records no longer exist.

    - Can't do this in a single SQL command, since the engine can't
      necessarily see both databases.
    - Can't use a single temporary table, since the progress database isn't
      necessarily the same as any of the destination database(s).
    - Can't do this in a multiprocess way, because we're trying to do a
      DELETE WHERE NOT IN.
    - So we fetch all source PKs (which, by definition, do exist), stash them
      in memory, and do a DELETE WHERE NOT IN based on those specified values
      (or, if there are no PKs in the source, delete everything from the
      destination).

    Problems:

    - This is IMPERFECT if we have string source PKs and there are hash
      collisions (e.g. PKs for records X and Y both hash to the same thing;
      record X is deleted; then its processed version might not be).
    - With massive tables, we might run out of memory or (much more likely)
      SQL parameter slots. This is now happening; the error looks like:
      pyodbc.ProgrammingError: ('The SQL contains 30807 parameter markers,
      but 2717783 parameters were supplied', 'HY000')

    A better way might be:

    - for each table, make a temporary table in the same database;
    - populate that table with (source PK integer/hash, source PK string)
      pairs;
    - delete where pairs don't match -- is that portable SQL?
      http://stackoverflow.com/questions/7356108/sql-query-for-deleting-rows-with-not-in-using-2-columns  # noqa
    - More efficient still would be to make one temporary table per
      destination database.

    On the "delete where multiple fields don't match":

    - Single-field syntax is

        DELETE FROM a WHERE a1 NOT IN (SELECT b1 FROM b)

    - Multiple-field syntax is

        DELETE FROM a WHERE NOT EXISTS (
            SELECT 1 FROM b
            WHERE a.a1 = b.b1
            AND a.a2 = b.b2
        )

    - In SQLAlchemy, use exists():
      http://stackoverflow.com/questions/14600619
      http://docs.sqlalchemy.org/en/latest/core/selectable.html

    - Furthermore, in SQL, NULL = NULL is false, and NULL <> NULL is also
      false, so we have to do an explicit null check. You do that with
      "field == None" (disabling any linter complaint about that comparison).
      See http://stackoverflow.com/questions/21668606

    We're aiming, therefore, for:

        DELETE FROM a WHERE NOT EXISTS (
            SELECT 1 FROM b
            WHERE a.a1 = b.b1
            AND (
                a.a2 = b.b2
                OR (a.a2 IS NULL AND b.b2 IS NULL)
            )
        )

    (An illustrative SQLAlchemy version of this pattern appears in the sketch
    following this function.)
    """
    # -------------------------------------------------------------------------
    # Sub-functions
    # -------------------------------------------------------------------------

    def insert(records_: List[Dict[str, Any]]) -> None:
        n_rows = len(records_)
        log.debug("... inserting {} records".format(n_rows))
        for db in databases:
            session_ = db['session']
            temptable_ = db['temptable']  # type: Table
            session_.execute(temptable_.insert(), records_)
            nlpdef.notify_transaction(session_, n_rows=n_rows,
                                      n_bytes=sys.getsizeof(records_))

    def commit() -> None:
        for db in databases:
            nlpdef.commit(db['session'])

    # -------------------------------------------------------------------------
    # Main code
    # -------------------------------------------------------------------------
    # Use info log level, otherwise it looks like our code hangs with very
    # large databases.
    log.info("delete_where_no_source: examining source table {}.{}; "
             "MAY BE SLOW".format(ifconfig.get_srcdb(),
                                  ifconfig.get_srctable()))

    # Start our list with the progress database
    databases = [{
        'session': nlpdef.get_progdb_session(),
        'engine': nlpdef.get_progdb_engine(),
        'metadata': nlpdef.get_progdb_metadata(),
        'temptable': None,  # type: Table
    }]

    # Add the processors' destination databases
    for processor in nlpdef.get_processors():  # of type BaseNlpParser
        session = processor.get_session()
        if any(x['session'] == session for x in databases):
            continue  # already exists
        databases.append({
            'session': session,
            'engine': processor.get_engine(),
            'metadata': processor.get_metadata(),
        })

    # Make a temporary table in each database (note: the Table objects become
    # affiliated to their engine, I think, so make separate ones for each).
    log.info("... using {n} destination database(s)".format(n=len(databases)))
    log.info("... dropping (if exists) and creating temporary table(s)")
    for database in databases:
        engine = database['engine']
        temptable = Table(
            nlpdef.get_temporary_tablename(),
            database['metadata'],
            Column(FN_SRCPKVAL, BigInteger),  # not PK, as may be a hash
            Column(FN_SRCPKSTR, String(MAX_STRING_PK_LENGTH)),
            **TABLE_KWARGS)
        temptable.drop(engine, checkfirst=True)
        temptable.create(engine, checkfirst=True)
        database['temptable'] = temptable

    # Insert PKs into temporary tables
    n = count_star(ifconfig.get_source_session(), ifconfig.get_srctable())
    log.info("... populating temporary table(s): {} records to go; working "
             "in chunks of {}".format(n, chunksize))
    i = 0
    records = []  # type: List[Dict[str, Any]]
    for pkval, pkstr in ifconfig.gen_src_pks():
        i += 1
        if report_every and i % report_every == 0:
            log.info("... src row# {} / {}".format(i, n))
        records.append({FN_SRCPKVAL: pkval, FN_SRCPKSTR: pkstr})
        if i % chunksize == 0:
            insert(records)
            records = []  # type: List[Dict[str, Any]]
    if records:  # remainder
        insert(records)

    # Commit
    commit()

    # Index, for speed
    log.info("... creating index(es) on temporary table(s)")
    for database in databases:
        temptable = database['temptable']  # type: Table
        index = Index('_temptable_idx', temptable.columns[FN_SRCPKVAL])
        index.create(database['engine'])

    # DELETE FROM desttable WHERE destpk NOT IN (SELECT srcpk FROM temptable)
    log.info("... deleting from progress/destination DBs where appropriate")

    # Delete from progress database
    prog_db = databases[0]
    prog_temptable = prog_db['temptable']
    ifconfig.delete_progress_records_where_srcpk_not(prog_temptable)

    # Delete from others
    for processor in nlpdef.get_processors():
        database = [x for x in databases
                    if x['session'] == processor.get_session()][0]
        temptable = database['temptable']
        processor.delete_where_srcpk_not(ifconfig, temptable)

    # Drop temporary tables
    log.info("... dropping temporary table(s)")
    for database in databases:
        database['temptable'].drop(database['engine'], checkfirst=True)

    # Commit
    commit()
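
# -----------------------------------------------------------------------------
# Illustrative sketch only, not called by CRATE: the docstring of
# delete_where_no_source() aims for DELETE ... WHERE NOT EXISTS with a
# NULL-safe comparison on the string PK. In practice that work is done by
# ifconfig.delete_progress_records_where_srcpk_not() and
# processor.delete_where_srcpk_not(); this sketch (function name and
# arguments hypothetical) shows how the pattern might be expressed in
# SQLAlchemy Core, reusing this module's FN_SRCPKVAL/FN_SRCPKSTR column names.
# -----------------------------------------------------------------------------

def _sketch_delete_where_not_exists(dest: Table, temptable: Table,
                                    session) -> None:
    from sqlalchemy.sql import and_, exists, or_
    # SQL's NULL = NULL is not true, so the string-PK equality test is paired
    # with explicit IS NULL checks on both sides; in SQLAlchemy,
    # "column == None" renders as "column IS NULL".
    matching = exists().where(
        and_(
            dest.c[FN_SRCPKVAL] == temptable.c[FN_SRCPKVAL],
            or_(
                dest.c[FN_SRCPKSTR] == temptable.c[FN_SRCPKSTR],
                and_(dest.c[FN_SRCPKSTR] == None,  # noqa
                     temptable.c[FN_SRCPKSTR] == None)  # noqa
            )
        )
    )
    # DELETE FROM dest WHERE NOT EXISTS (...)
    session.execute(dest.delete().where(~matching))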