def drop_remake(progargs,
                nlpdef: NlpDefinition,
                incremental: bool = False,
                skipdelete: bool = False) -> None:
    """
    Drop output tables and recreate them.
    """
    # Not parallel.
    # -------------------------------------------------------------------------
    # 1. Progress database
    # -------------------------------------------------------------------------
    progengine = nlpdef.get_progdb_engine()
    if not incremental:
        # Full run: wipe the progress table entirely before recreating it.
        log.debug("Dropping progress tables")
        NlpRecord.__table__.drop(progengine, checkfirst=True)
    log.info("Creating progress table (with index)")
    NlpRecord.__table__.create(progengine, checkfirst=True)

    # -------------------------------------------------------------------------
    # 2. Output database(s)
    # -------------------------------------------------------------------------
    # Track table names made so far, so we can warn about collisions between
    # processors.
    seen_table_names = []  # type: List[str]
    for proc in nlpdef.get_processors():
        freshly_made = proc.make_tables(drop_first=not incremental)
        duplicates = [t for t in freshly_made if t in seen_table_names]
        for dup in duplicates:
            log.warning("An NLP processor has tried to re-make a table "
                        "made by one of its colleagues: {}".format(dup))
        seen_table_names.extend(freshly_made)

    # -------------------------------------------------------------------------
    # 3. Delete WHERE NOT IN for incremental
    # -------------------------------------------------------------------------
    for ifconfig in nlpdef.get_ifconfigs():
        with MultiTimerContext(timer, TIMING_DELETE_WHERE_NO_SOURCE):
            if not incremental:  # full run
                ifconfig.delete_all_progress_records()
            elif not skipdelete:
                # Incremental: remove output/progress rows whose source
                # records have vanished.
                delete_where_no_source(
                    nlpdef, ifconfig,
                    report_every=progargs.report_every_fast,
                    chunksize=progargs.chunksize)

    # -------------------------------------------------------------------------
    # 4. Overall commit (superfluous)
    # -------------------------------------------------------------------------
    nlpdef.commit_all()
def process_nlp(nlpdef: NlpDefinition,
                incremental: bool = False,
                report_every: int = DEFAULT_REPORT_EVERY_NLP,
                tasknum: int = 0,
                ntasks: int = 1) -> None:
    """
    Main NLP processing function. Fetch text, send it to the GATE app
    (storing the results), and make a note in the progress database.

    Args:
        nlpdef: the NLP definition (processors, source/destination configs).
        incremental: skip source records whose hash matches the progress
            database (i.e. unchanged since last run)?
        report_every: log progress every this many records (0/None disables).
        tasknum: zero-based index of this process among parallel tasks.
        ntasks: total number of parallel tasks sharing the work.
    """
    log.info(SEP + "NLP")
    session = nlpdef.get_progdb_session()
    for ifconfig in nlpdef.get_ifconfigs():
        i = 0  # record count within this process
        recnum = tasknum  # record count overall
        totalcount = ifconfig.get_count()  # total number of records in table
        for text, other_values in ifconfig.gen_text(tasknum=tasknum,
                                                    ntasks=ntasks):
            i += 1
            pkval = other_values[FN_SRCPKVAL]
            pkstr = other_values[FN_SRCPKSTR]
            if report_every and i % report_every == 0:
                # NOTE: fixed here — the original passed a dead keyword
                # argument i=i to this .format() call; there is no {i}
                # placeholder in the outer format string (i is only used
                # in the nested "thisproc" format below).
                log.info(
                    "Processing {db}.{t}.{c}, PK: {pkf}={pkv} "
                    "({overall}record {approx}{recnum}/{totalcount})"
                    "{thisproc}".format(
                        db=other_values[FN_SRCDB],
                        t=other_values[FN_SRCTABLE],
                        c=other_values[FN_SRCFIELD],
                        pkf=other_values[FN_SRCPKFIELD],
                        pkv=pkstr if pkstr else pkval,
                        overall="overall " if ntasks > 1 else "",
                        approx="~" if pkstr and ntasks > 1 else "",
                        # ... string hashing means approx. distribution
                        recnum=recnum + 1,
                        totalcount=totalcount,
                        thisproc=(
                            " ({i}/~{proccount} this process)".format(
                                i=i, proccount=totalcount // ntasks)
                            if ntasks > 1 else ""
                        )
                    )
                )
            recnum += ntasks
            # log.critical("other_values={}".format(repr(other_values)))
            srchash = nlpdef.hash(text)
            progrec = None
            if incremental:
                progrec = ifconfig.get_progress_record(pkval, pkstr)
                if progrec is not None:
                    if progrec.srchash == srchash:
                        log.debug("Record previously processed; skipping")
                        continue
                    else:
                        log.debug("Record has changed")
                else:
                    log.debug("Record is new")

            for processor in nlpdef.get_processors():
                if incremental:
                    # In this branch incremental is always True, so the
                    # commit flag is a constant (the original passed
                    # commit=incremental).
                    processor.delete_dest_record(ifconfig, pkval, pkstr,
                                                 commit=True)
                processor.process(text, other_values)

            # Make a note in the progress database that we've processed a
            # source record.
            if progrec is not None:  # modifying an existing record
                progrec.whenprocessedutc = nlpdef.get_now()
                progrec.srchash = srchash
            else:  # creating a new record
                progrec = NlpRecord(
                    # Quasi-key fields:
                    srcdb=ifconfig.get_srcdb(),
                    srctable=ifconfig.get_srctable(),
                    srcpkval=pkval,
                    srcpkstr=pkstr,
                    srcfield=ifconfig.get_srcfield(),
                    nlpdef=nlpdef.get_name(),
                    # Other fields:
                    srcpkfield=ifconfig.get_srcpkfield(),
                    whenprocessedutc=nlpdef.get_now(),
                    srchash=srchash,
                )
                with MultiTimerContext(timer, TIMING_PROGRESS_DB_ADD):
                    session.add(progrec)

            # In incremental mode, do we commit immediately, because other
            # processes may need this table promptly... ?

            # force_commit = False  # definitely wrong; crashes as below
            # force_commit = incremental
            force_commit = ntasks > 1

            # - A single source record should not be processed by >1 CRATE
            #   process. So in theory there should be no conflicts.
            # - However, databases can lock in various ways. Can we guarantee
            #   it'll do something sensible?
            # - See also
            #   https://en.wikipedia.org/wiki/Isolation_(database_systems)
            #   http://skien.cc/blog/2014/02/06/sqlalchemy-and-race-conditions-follow-up/  # noqa
            #   http://docs.sqlalchemy.org/en/latest/core/connections.html?highlight=execution_options#sqlalchemy.engine.Connection.execution_options  # noqa
            # - However, empirically, setting this to False gives
            #   "Transaction (Process ID xx) was deadlocked on lock resources
            #   with another process and has been chosen as the deadlock
            #   victim. Rerun the transaction." -- with a SELECT query.
            # - SQL Server uses READ COMMITTED as the default isolation level.
            # - https://technet.microsoft.com/en-us/library/jj856598(v=sql.110).aspx  # noqa

            nlpdef.notify_transaction(
                session=session, n_rows=1,
                n_bytes=sys.getsizeof(progrec),  # approx
                force_commit=force_commit)

    nlpdef.commit_all()