def __init__(self,
             nlpdef: Optional[NlpDefinition],
             cfgsection: Optional[str],
             commit: bool = False,
             name: str = "?") -> None:
    """
    Store basic parser configuration and look up the destination database.

    Args:
        nlpdef:
            a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
        cfgsection:
            the name of a CRATE NLP config file section, TO WHICH we will
            add a "processor:" prefix (from which section we may choose to
            get extra config information)
        commit:
            force a COMMIT whenever we insert data? You should specify this
            in multiprocess mode, or you may get database deadlocks.
        name:
            friendly name for the parser
    """
    self._nlpdef = nlpdef
    self._cfgsection = cfgsection
    self._commit = commit
    self._name = name
    self._destdb_name = None  # type: Optional[str]
    self._destdb = None  # type: Optional[DatabaseHolder]
    if nlpdef is None:
        # No NLP definition (e.g. debugging): use empty placeholders.
        self._sectionname = ""
        self._destdb_name = ""
        self._destdb = None  # type: Optional[DatabaseHolder]
    else:
        self._sectionname = full_sectionname(
            NlpConfigPrefixes.PROCESSOR, cfgsection)
        self._destdb_name = nlpdef.opt_str(
            self._sectionname, ProcessorConfigKeys.DESTDB, required=True)
        self._destdb = nlpdef.get_database(self._destdb_name)
def __init__(self,
             nlpdef: Optional[NlpDefinition],
             cfgsection: Optional[str],
             commit: bool = False) -> None:
    """
    Read cloud processor settings (name, version, format) and the output
    type map from the processor's config section.

    Args:
        nlpdef:
            :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
        cfgsection:
            the config section for the processor
        commit:
            force a COMMIT whenever we insert data? You should specify this
            in multiprocess mode, or you may get database deadlocks.
    """
    super().__init__(nlpdef, cfgsection, commit, name="Cloud")
    self.remote_processor_info = None  # type: Optional[ServerProcessor]
    sectionname = full_sectionname(NlpConfigPrefixes.PROCESSOR, cfgsection)
    self.procname = nlpdef.opt_str(
        sectionname, ProcessorConfigKeys.PROCESSOR_NAME,
        required=True)
    self.procversion = nlpdef.opt_str(
        sectionname, ProcessorConfigKeys.PROCESSOR_VERSION,
        default=None)
    # Made format required so people are less likely to make mistakes
    self.format = nlpdef.opt_str(
        sectionname, ProcessorConfigKeys.PROCESSOR_FORMAT,
        required=True)
    # Remote schema details: unknown until the server tells us.
    self.schema_type = None
    self.sql_dialect = None
    self.schema = None  # type: Optional[Dict[str, Any]]
    self.available_remotely = False  # update later if available

    # Output section - bit of repetition from the 'Gate' parser
    typepairs = nlpdef.opt_strlist(
        sectionname, ProcessorConfigKeys.OUTPUTTYPEMAP,
        required=True, lower=False)
    self._outputtypemap = {}  # type: Dict[str, OutputUserConfig]
    self._type_to_tablename = {}  # type: Dict[str, str]
    self.tablename = None
    # If typepairs is empty the following block won't execute
    for pair in chunks(typepairs, 2):
        output_type = pair[0].lower()  # annotation types: case-insensitive
        outputsection = pair[1]
        otconfig = OutputUserConfig(nlpdef.get_parser(), outputsection,
                                    schema_required=False)
        self._outputtypemap[output_type] = otconfig
        self._type_to_tablename[output_type] = otconfig.get_tablename()
        if output_type == '""':
            self.tablename = otconfig.get_tablename()
def __init__(self, nlpdef: NlpDefinition) -> None:
    """
    Set up a cloud request: fetch the cloud configuration from the NLP
    definition and prepare HTTP authentication/state.

    Args:
        nlpdef:
            :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
    """
    self._nlpdef = nlpdef
    self._nlpdef_sectionname = full_sectionname(NlpConfigPrefixes.NLPDEF,
                                                nlpdef.get_name())
    cloudcfg = nlpdef.get_cloud_config_or_raise()
    self._cloudcfg = cloudcfg
    # HTTP basic-auth credentials, taken from the cloud config.
    self._auth = (cloudcfg.username, cloudcfg.password)
    self._post = self._internal_post
    self.cookies = None  # type: Optional[CookieJar]
def __init__(self,
             nlpdef: Optional[NlpDefinition],
             cfg_processor_name: Optional[str],
             commit: bool = False,
             friendly_name: str = "?") -> None:
    """
    ``__init__`` function for :class:`TableMaker`.

    Args:
        nlpdef:
            An instance of
            :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`.
        cfg_processor_name:
            The name of a CRATE NLP config file section, TO WHICH we will
            add a ``processor:`` prefix (from which section we may choose
            to get extra config information).
        commit:
            Force a COMMIT whenever we insert data? You should specify this
            in multiprocess mode, or you may get database deadlocks.
        friendly_name:
            Friendly name for the parser.
    """
    self._nlpdef = nlpdef
    self._cfg_processor_name = cfg_processor_name
    self._commit = commit
    self._friendly_name = friendly_name
    self._destdb_name = None  # type: Optional[str]
    self._destdb = None  # type: Optional[DatabaseHolder]
    if nlpdef is not None:
        self._sectionname = full_sectionname(NlpConfigPrefixes.PROCESSOR,
                                             cfg_processor_name)
        self._cfgsection = nlpdef.get_config_section(self._sectionname)
        self._destdb_name = self._cfgsection.opt_str(
            ProcessorConfigKeys.DESTDB, required=True)
        self._destdb = nlpdef.get_database(self._destdb_name)
    else:
        # No NLP definition supplied (e.g. debugging): empty placeholders.
        self._sectionname = ""
        self._cfgsection = None  # type: Optional[ConfigSection]
        self._destdb_name = ""
        self._destdb = None  # type: Optional[DatabaseHolder]
def __init__(self,
             nlpdef: NlpDefinition,
             cfgsection: str,
             commit: bool = False) -> None:
    """
    Configure the MedEx external-program parser: destination table, program
    arguments/environment, and the input/output/working directories used to
    exchange files with the MedEx Java process.

    Args:
        nlpdef:
            a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
        cfgsection:
            the name of a CRATE NLP config file section (from which we may
            choose to get extra config information)
        commit:
            force a COMMIT whenever we insert data? You should specify this
            in multiprocess mode, or you may get database deadlocks.
    """
    super().__init__(nlpdef=nlpdef, cfgsection=cfgsection, commit=commit,
                     name="MedEx")
    if nlpdef is None:
        # only None for debugging!
        self._debug_mode = True
        self._tablename = self.classname().lower()
        self._max_external_prog_uses = 1
        self._progenvsection = ""
        self._env = {}  # type: Dict[str, str]
        progargs = ""
    else:
        self._debug_mode = False
        self._tablename = nlpdef.opt_str(
            self._sectionname, ProcessorConfigKeys.DESTTABLE,
            required=True)
        # 0 means "no limit on external program re-use".
        self._max_external_prog_uses = nlpdef.opt_int(
            self._sectionname, ProcessorConfigKeys.MAX_EXTERNAL_PROG_USES,
            default=0)
        self._progenvsection = nlpdef.opt_str(
            self._sectionname, ProcessorConfigKeys.PROGENVSECTION)
        if self._progenvsection:
            # Merge the named env config section over os.environ.
            self._env = nlpdef.get_env_dict(
                full_sectionname(NlpConfigPrefixes.ENV,
                                 self._progenvsection),
                os.environ)
        else:
            self._env = os.environ.copy()
        self._env["NLPLOGTAG"] = nlpdef.get_logtag() or '.'
        # ... because passing a "-lt" switch with no parameter will make
        # CrateGatePipeline.java complain and stop
        progargs = nlpdef.opt_str(
            self._sectionname, ProcessorConfigKeys.PROGARGS,
            required=True)

    if USE_TEMP_DIRS:
        self._inputdir = tempfile.TemporaryDirectory()
        self._outputdir = tempfile.TemporaryDirectory()
        self._workingdir = tempfile.TemporaryDirectory()
        # ... these are autodeleted when the object goes out of scope; see
        # https://docs.python.org/3/library/tempfile.html
        # ... which manages it using weakref.finalize
    else:
        # Fixed directories under the user's home (kept for debugging);
        # PseudoTempDir presumably mimics TemporaryDirectory's ``.name``
        # attribute without auto-deletion — TODO confirm.
        homedir = os.path.expanduser("~")
        self._inputdir = PseudoTempDir(
            os.path.join(homedir, "medextemp", "input"))
        mkdir_p(self._inputdir.name)
        self._outputdir = PseudoTempDir(
            os.path.join(homedir, "medextemp", "output"))
        mkdir_p(self._outputdir.name)
        self._workingdir = PseudoTempDir(
            os.path.join(homedir, "medextemp", "working"))
        mkdir_p(self._workingdir.name)

    # Substitute environment variables into the program-argument template,
    # then split it shell-style into an argv list.
    formatted_progargs = progargs.format(**self._env)
    self._progargs = shlex.split(formatted_progargs)
    self._progargs.extend([
        "-data_ready_signal", MEDEX_DATA_READY_SIGNAL,
        "-results_ready_signal", MEDEX_RESULTS_READY_SIGNAL,
        "-i", self._inputdir.name,
        "-o", self._outputdir.name,
    ])

    self._n_uses = 0  # how many times the external program has been used
    self._pipe_encoding = 'utf8'
    self._file_encoding = 'utf8'
    self._p = None  # the subprocess
    self._started = False
def __init__(self, parser: ExtendedConfigParser, section: str, schema_required: bool = True) -> None: """ Read config from a configparser section. Args: parser: :class:`crate_anon.common.extendedconfigparser.ExtendedConfigParser` section: config file section name -- this is the second of the pair of strings in the ``outputtypemap`` part of the GATE NLP app config section. See - :ref:`NLP config file <nlp_config>` - :class:`crate_anon.nlp_manager.parse_gate.Gate` schema_required: is it required that the user has specified a schema, i.e. destfields and a desttable? - Should be true for Gate, False for Cloud as the remote processors may have their own schema definition. """ # noqa sectionname = full_sectionname(NlpConfigPrefixes.OUTPUT, section) def opt_str(option: str, required: bool = False) -> str: return parser.get_str(sectionname, option, required=required) def opt_strlist(option: str, required: bool = False, as_words: bool = True) -> List[str]: return parser.get_str_list(sectionname, option, required=required, lower=False, as_words=as_words) # We do NOT change the case. 
if not parser.has_section(sectionname): raise ValueError("config missing section: " + sectionname) # --------------------------------------------------------------------- # desttable # --------------------------------------------------------------------- self._desttable = opt_str(NlpOutputConfigKeys.DESTTABLE, required=True) ensure_valid_table_name(self._desttable) # --------------------------------------------------------------------- # renames # --------------------------------------------------------------------- self._renames = {} # type: Dict[str, str] rename_lines = opt_strlist(NlpOutputConfigKeys.RENAMES, required=False, as_words=False) for line in rename_lines: if not line.strip(): continue words = shlex.split(line) if len(words) != 2: raise ValueError( f"Bad {NlpOutputConfigKeys.RENAMES!r} option in config " f"section {sectionname!r}; line was {line!r} but should " f"have contained two things") annotation_name = words[0] field_name = words[1] ensure_valid_field_name(field_name) self._renames[annotation_name] = field_name # --------------------------------------------------------------------- # null_literals # --------------------------------------------------------------------- null_literal_lines = opt_strlist(NlpOutputConfigKeys.NULL_LITERALS, required=False, as_words=False) self._null_literals = [] # type: List[str] for line in null_literal_lines: self._null_literals += shlex.split(line) # --------------------------------------------------------------------- # destfields # --------------------------------------------------------------------- self._destfields = [] # type: List[str] self._dest_datatypes = [] # type: List[str] self._dest_comments = [] # type: List[str] dest_field_lines = opt_strlist(NlpOutputConfigKeys.DESTFIELDS, required=schema_required, as_words=False) # ... comments will be removed during that process. 
# log.critical(dest_field_lines) # If dest_field_lines is empty (as it may be for a Cloud processor) # the following block doesn't execute, so the 'dest' attributed remain # empty for dfl in dest_field_lines: parts = dfl.split(maxsplit=2) assert len(parts) >= 2, f"Bad field definition line: {dfl!r}" field = parts[0] datatype = parts[1].upper() comment = parts[2] if len(parts) > 2 else None ensure_valid_field_name(field) if not is_sqltype_valid(datatype): raise Exception(f"Invalid datatype for {field}: {datatype}") self._destfields.append(field) self._dest_datatypes.append(datatype) self._dest_comments.append(comment) src_fields = [ c.name for c in InputFieldConfig.get_core_columns_for_dest() ] for sf in src_fields: if sf in self._destfields: raise Exception( f"For section {sectionname}, destination field {sf} is " f"auto-supplied; do not add it manually") if len(set(self._destfields)) != len(self._destfields): raise ValueError(f"Duplicate fields exist in destination fields: " f"{self._destfields}") # --------------------------------------------------------------------- # indexdefs # --------------------------------------------------------------------- self._indexfields = [] # type: List[str] self._indexlengths = [] # type: List[int] indexdefs = opt_strlist(NlpOutputConfigKeys.INDEXDEFS) if indexdefs: for c in chunks(indexdefs, 2): # pairs: field, length indexfieldname = c[0] lengthstr = c[1] if indexfieldname not in self._destfields: raise ValueError(f"Index field {indexfieldname} not in " f"destination fields {self._destfields}") try: length = ast.literal_eval(lengthstr) if length is not None: length = int(length) except ValueError: raise ValueError(f"Bad index length: {lengthstr}") self._indexfields.append(indexfieldname) self._indexlengths.append(length)
def __init__(self,
             nlpdef: "NlpDefinition",
             name: str,
             req_data_dir: str) -> None:
    """
    Reads the config from the NLP definition's config file.

    Args:
        nlpdef:
            a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
        name:
            name for the cloud NLP configuration (to which a standard
            prefix will be added to get the config section name)
        req_data_dir:
            directory in which to store temporary request files
    """
    from crate_anon.nlp_manager.cloud_parser import Cloud  # delayed import  # noqa
    self._nlpdef = nlpdef
    self.req_data_dir = req_data_dir

    cfg = nlpdef.get_config_section(
        full_sectionname(NlpConfigPrefixes.CLOUD, name))
    keys = CloudNlpConfigKeys  # local alias for brevity
    self.url = cfg.opt_str(keys.CLOUD_URL, required=True)
    self.verify_ssl = cfg.opt_bool(keys.VERIFY_SSL, True)
    self.compress = cfg.opt_bool(keys.COMPRESS, True)
    self.username = cfg.opt_str(keys.USERNAME, default="")
    self.password = cfg.opt_str(keys.PASSWORD, default="")
    self.max_content_length = cfg.opt_int(
        keys.MAX_CONTENT_LENGTH, DEFAULT_CLOUD_MAX_CONTENT_LENGTH)
    self.limit_before_commit = cfg.opt_int(
        keys.LIMIT_BEFORE_COMMIT, DEFAULT_CLOUD_LIMIT_BEFORE_COMMIT)
    self.max_records_per_request = cfg.opt_int(
        keys.MAX_RECORDS_PER_REQUEST,
        DEFAULT_CLOUD_MAX_RECORDS_PER_REQUEST)
    self.stop_at_failure = cfg.opt_bool(keys.STOP_AT_FAILURE, True)
    self.wait_on_conn_err = cfg.opt_int(
        keys.WAIT_ON_CONN_ERR, DEFAULT_CLOUD_WAIT_ON_CONN_ERR_S)
    self.max_tries = cfg.opt_int(keys.MAX_TRIES, DEFAULT_CLOUD_MAX_TRIES)
    self.rate_limit_hz = cfg.opt_int(keys.RATE_LIMIT_HZ,
                                     DEFAULT_CLOUD_RATE_LIMIT_HZ)
    self.test_length_function_speed = cfg.opt_bool(
        keys.TEST_LENGTH_FUNCTION_SPEED, True)

    # Index the NLP definition's cloud processors by (name, version).
    self.remote_processors = {}  # type: Dict[Tuple[str, str], 'Cloud']
    for processor in self._nlpdef.processors:
        if not isinstance(processor, Cloud):
            # ... only add 'Cloud' processors
            log.warning(
                f"Skipping NLP processor of non-cloud (e.g. local) "
                f"type: {processor.friendly_name}")
            continue
        # NOTE: KEY IS A TUPLE!
        self.remote_processors[(processor.procname,
                                processor.procversion)] = processor

    # We need the following in order to decide whether to ask to include
    # text in reply - if a processor is GATE we need to, as it does not
    # send back the content of the nlp snippet
    self.has_gate_processors = any(
        (x.format == NlpDefValues.FORMAT_GATE)
        for x in self.remote_processors.values())
def __init__(self,
             nlpdef: NlpDefinition,
             cfgsection: str,
             commit: bool = False) -> None:
    """
    Configure the GATE external-program parser: terminators, output type
    map, program arguments, and environment for the Java pipeline.

    Args:
        nlpdef:
            a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
        cfgsection:
            the name of a CRATE NLP config file section (from which we may
            choose to get extra config information)
        commit:
            force a COMMIT whenever we insert data? You should specify this
            in multiprocess mode, or you may get database deadlocks.
    """
    super().__init__(nlpdef=nlpdef, cfgsection=cfgsection, commit=commit,
                     name="GATE")
    if not nlpdef and not cfgsection:
        # Debugging only
        self._debug_mode = True
        self._max_external_prog_uses = 0
        self._input_terminator = 'input_terminator'
        self._output_terminator = 'output_terminator'
        typepairs = []  # type: List[str]
        self._progenvsection = ''
        progargs = ''
        logtag = ''
    else:
        self._debug_mode = False
        # 0 means "no limit on external program re-use".
        self._max_external_prog_uses = nlpdef.opt_int(
            self._sectionname, ProcessorConfigKeys.MAX_EXTERNAL_PROG_USES,
            default=0)
        self._input_terminator = nlpdef.opt_str(
            self._sectionname, ProcessorConfigKeys.INPUT_TERMINATOR,
            required=True)
        self._output_terminator = nlpdef.opt_str(
            self._sectionname, ProcessorConfigKeys.OUTPUT_TERMINATOR,
            required=True)
        typepairs = nlpdef.opt_strlist(
            self._sectionname, ProcessorConfigKeys.OUTPUTTYPEMAP,
            required=True, lower=False)
        self._progenvsection = nlpdef.opt_str(
            self._sectionname, ProcessorConfigKeys.PROGENVSECTION)
        progargs = nlpdef.opt_str(
            self._sectionname, ProcessorConfigKeys.PROGARGS,
            required=True)
        logtag = nlpdef.get_logtag() or '.'

    # Build maps: annotation type -> OutputUserConfig / destination table.
    self._outputtypemap = {}  # type: Dict[str, OutputUserConfig]
    self._type_to_tablename = {}  # type: Dict[str, str]
    for c in chunks(typepairs, 2):  # pairs: annotation type, section name
        annottype = c[0]
        outputsection = c[1]
        # 2018-03-27: not clear why we need to force the user to specify
        # in lower case! We just said it's case-insensitive. So ditch this:
        #
        # if annottype != annottype.lower():
        #     raise Exception(
        #         "Section {}: annotation types in outputtypemap must be in "  # noqa
        #         "lower case: change {}".format(cfgsection, annottype))
        #
        # and add this:
        annottype = annottype.lower()
        # log.critical(outputsection)
        c = OutputUserConfig(nlpdef.get_parser(), outputsection)
        self._outputtypemap[annottype] = c
        self._type_to_tablename[annottype] = c.get_tablename()

    if self._progenvsection:
        # Merge the named env config section over os.environ.
        self._env = nlpdef.get_env_dict(
            full_sectionname(NlpConfigPrefixes.ENV, self._progenvsection),
            os.environ)
    else:
        self._env = os.environ.copy()
    self._env["NLPLOGTAG"] = logtag
    # ... We have ensured that this is not empty for real use, because
    # passing a "-lt" switch with no parameter will make
    # CrateGatePipeline.java complain and stop. The environment variable
    # is read via the "progargs" config argument, as follows.

    # Substitute environment variables into the program-argument template,
    # then split shell-style into an argv list.
    formatted_progargs = progargs.format(**self._env)
    self._progargs = shlex.split(formatted_progargs)

    self._n_uses = 0  # how many times the external program has been used
    self._pipe_encoding = 'utf8'
    self._p = None  # the subprocess
    self._started = False

    # Sanity checks
    for ty, tn in self._type_to_tablename.items():
        assert len(tn) <= MAX_SQL_FIELD_LEN, (
            f"Table name too long (max {MAX_SQL_FIELD_LEN} characters)")
def __init__(self, nlpdef: NlpDefinition, cfg_input_name: str) -> None:
    """
    Read config from a configparser section, and also associate with a
    specific NLP definition.

    Args:
        nlpdef:
            :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`,
            the master NLP definition, referring to the master config
            file etc.
        cfg_input_name:
            config section name for the input field definition
    """
    self.name = cfg_input_name
    cfg = nlpdef.get_config_section(
        full_sectionname(NlpConfigPrefixes.INPUT, cfg_input_name))
    self._nlpdef = nlpdef

    # Source database/table/field names:
    self._srcdb = cfg.opt_str(InputFieldConfigKeys.SRCDB)
    self._srctable = cfg.opt_str(InputFieldConfigKeys.SRCTABLE)
    self._srcpkfield = cfg.opt_str(InputFieldConfigKeys.SRCPKFIELD)
    self._srcfield = cfg.opt_str(InputFieldConfigKeys.SRCFIELD)
    self._srcdatetimefield = cfg.opt_str(
        InputFieldConfigKeys.SRCDATETIMEFIELD, required=False)
    # ... new in v0.18.52

    # Make these case-sensitive to avoid our failure in renaming SQLA
    # Column objects to be lower-case:
    self._copyfields = cfg.opt_multiline(
        InputFieldConfigKeys.COPYFIELDS)  # fieldnames
    self._indexed_copyfields = cfg.opt_multiline(
        InputFieldConfigKeys.INDEXED_COPYFIELDS)
    # 0 means "no debug row limit".
    self._debug_row_limit = cfg.opt_int(
        InputFieldConfigKeys.DEBUG_ROW_LIMIT, default=0)
    # self._fetch_sorted = opt_bool('fetch_sorted', default=True)

    # Validate names (raises on invalid SQL identifiers):
    ensure_valid_table_name(self._srctable)
    ensure_valid_field_name(self._srcpkfield)
    ensure_valid_field_name(self._srcfield)
    if self._srcdatetimefield:
        ensure_valid_field_name(self._srcdatetimefield)

    if len(set(self._indexed_copyfields)) != len(self._indexed_copyfields):
        raise ValueError(
            f"Redundant indexed_copyfields: {self._indexed_copyfields}")

    if len(set(self._copyfields)) != len(self._copyfields):
        raise ValueError(f"Redundant copyfields: {self._copyfields}")

    # Every indexed copy-field must also be copied.
    indexed_not_copied = set(self._indexed_copyfields) - set(
        self._copyfields)
    if indexed_not_copied:
        raise ValueError(f"Fields in index_copyfields but not in "
                         f"copyfields: {indexed_not_copied}")

    # allfields = [self._srcpkfield, self._srcfield] + self._copyfields
    # if len(allfields) != len(set(allfields)):
    #     raise ValueError(
    #         f"Field overlap in InputFieldConfig: {section}")
    # RE-THOUGHT: OK to copy source text fields etc. if desired.
    # It's fine in SQL to say SELECT a, a FROM mytable;

    self._db = nlpdef.get_database(self._srcdb)