def get_drawing_config_by_storage_form(directory, term):
    """Return the drawing-configuration argument dict for type *term*.

    Builds (and caches per *directory*) a mapping from each configured
    type's storage form to a dict of its single-valued drawing arguments.
    Returns None when *term* has no drawing configuration.
    """
    cache = get_drawing_config_by_storage_form.__cache
    if directory not in cache:
        d = {}
        for n in get_drawing_config(directory):
            t = n.storage_form()
            if t in d:
                # duplicate definition: last one wins (d[t] is reset below)
                Messager.warning("Project configuration: term %s appears multiple times, only using last. Configuration may be wrong." % t, 5)
            d[t] = {}
            for a in n.arguments:
                if len(n.arguments[a]) != 1:
                    # drawing arguments are expected to be single-valued;
                    # multi-valued entries are dropped with a warning
                    Messager.warning("Project configuration: expected single value for %s argument %s, got '%s'. Configuration may be wrong." % (t, a, "|".join(n.arguments[a])))
                else:
                    d[t][a] = n.arguments[a][0]

        # TODO: hack to get around inability to have commas in values;
        # fix original issue instead
        # (every "-" in every stored value is rewritten to ",")
        for t in d:
            for k in d[t]:
                d[t][k] = d[t][k].replace("-", ",")

        # propagate defaults (TODO: get rid of magic "DEFAULT" values)
        # each key of the SPAN/ARC default entries is copied into every
        # type that does not already define that key
        default_keys = [VISUAL_SPAN_DEFAULT, VISUAL_ARC_DEFAULT]
        for default_dict in [d.get(dk, {}) for dk in default_keys]:
            for k in default_dict:
                for t in d:
                    d[t][k] = d[t].get(k, default_dict[k])

        cache[directory] = d

    return cache[directory].get(term, None)
def _check_DB_version(database):
    """Emit a user-visible warning when *database* does not match the
    norm DB version that fbkvdb expects; silent when it matches."""
    import fbkvdb
    if fbkvdb.check_version(database):
        return
    from message import Messager
    Messager.warning(
        "Warning: norm DB version mismatch: expected %s, got %s for %s"
        % (fbkvdb.NORM_DB_VERSION, fbkvdb.get_version(database), database))
def _get_db_path(database, collection):
    """Resolve the configured on-disk path and unicode flag for *database*.

    Returns a (path, is_unicode) pair; path is None when no collection is
    given or the database is not found in the collection's normalization
    config, signalling "use the default location".

    Fix: the original computed conf_dir/projectconf/norm_conf twice —
    once *before* the try block (unprotected, so any error there escaped
    the intended fallback) and once inside it. The unprotected duplicate
    is removed.
    """
    if collection is None:
        # TODO: default to WORK_DIR config?
        return (None, Simstring.DEFAULT_UNICODE)

    try:
        conf_dir = real_directory(collection)
        projectconf = ProjectConfiguration(conf_dir)
        norm_conf = projectconf.get_normalization_config()
        for entry in norm_conf:
            # TODO THIS IS WRONG
            dbname, dbpath, dbunicode = entry[0], entry[3], entry[4]
            if dbname == database:
                return (dbpath, dbunicode)
        # not found in config.
        Messager.warning('DB ' + database + ' not defined in config for ' +
                         collection + ', falling back on default.')
        return (None, Simstring.DEFAULT_UNICODE)
    except Exception:
        # whatever goes wrong, just warn and fall back on the default.
        Messager.warning('Failed to get DB path from config for ' +
                         collection + ', falling back on default.')
        return (None, Simstring.DEFAULT_UNICODE)
def get_drawing_config_by_storage_form(directory, term):
    """Return the drawing-configuration argument dict for type *term*.

    Builds (and caches per *directory*) a mapping from each configured
    type's storage form to a dict of its single-valued drawing arguments.
    Returns None when *term* has no drawing configuration.
    """
    cache = get_drawing_config_by_storage_form.__cache
    if directory not in cache:
        d = {}
        for n in get_drawing_config(directory):
            t = n.storage_form()
            if t in d:
                # duplicate definition: last one wins (d[t] is reset below)
                Messager.warning(
                    "Project configuration: term %s appears multiple times, only using last. Configuration may be wrong." % t, 5)
            d[t] = {}
            for a in n.arguments:
                if len(n.arguments[a]) != 1:
                    # drawing arguments are expected to be single-valued;
                    # multi-valued entries are dropped with a warning
                    Messager.warning(
                        "Project configuration: expected single value for %s argument %s, got '%s'. Configuration may be wrong." % (t, a, "|".join(n.arguments[a])))
                else:
                    d[t][a] = n.arguments[a][0]

        # TODO: hack to get around inability to have commas in values;
        # fix original issue instead
        # (every "-" in every stored value is rewritten to ",")
        for t in d:
            for k in d[t]:
                d[t][k] = d[t][k].replace("-", ",")

        # propagate defaults (TODO: get rid of magic "DEFAULT" values)
        # each key of the SPAN/ARC default entries is copied into every
        # type that does not already define that key
        default_keys = [VISUAL_SPAN_DEFAULT, VISUAL_ARC_DEFAULT]
        for default_dict in [d.get(dk, {}) for dk in default_keys]:
            for k in default_dict:
                for t in d:
                    d[t][k] = d[t].get(k, default_dict[k])

        cache[directory] = d

    return cache[directory].get(term, None)
def wrapper(*args, **kwds):
    # Pass-through wrapper for a deprecated dispatcher action: when DEBUG
    # is on, warn the client that the wrapped action (closure variable
    # `func`) is deprecated, then delegate to it unchanged.
    if DEBUG:
        Messager.warning(
            ('Client sent "%s" action '
             'which is marked as deprecated') % func.__name__,)
    return func(*args, **kwds)
def _get_match_regex(text, text_match="word", match_case=False,
                     whole_string=False):
    """
    Helper for the various search_anns_for_ functions.

    Returns a compiled regular expression implementing the requested
    match semantics ("word", "substring" or "regex"), or None when the
    given regex is invalid or *text_match* is unrecognized.
    """
    if match_case:
        regex_flags = 0
    else:
        regex_flags = re.IGNORECASE

    if text is None:
        text = ''

    if text_match == "word":
        # full word match: require word boundaries or, optionally,
        # whole string boundaries
        if whole_string:
            return re.compile(r'^' + re.escape(text) + r'$', regex_flags)
        else:
            return re.compile(r'\b' + re.escape(text) + r'\b', regex_flags)
    elif text_match == "substring":
        # any substring match, as text (nonoverlapping matches)
        return re.compile(re.escape(text), regex_flags)
    elif text_match == "regex":
        try:
            return re.compile(text, regex_flags)
        except re.error:
            # narrowed from a bare except: invalid patterns raise
            # re.error (sre_constants.error is an alias of it)
            Messager.warning('Given string "%s" is not a valid regular expression.' % text)
            return None
    else:
        Messager.error('Unrecognized search match specification "%s"' % text_match)
        return None
def attributes_for(self, ann_type):
    """
    Returs a list of the possible attribute types for an
    annotation of the given type.
    """
    result = []
    for attr in get_attribute_type_list(self.directory):
        if attr == SEPARATOR_STR:
            continue
        if 'Arg' not in attr.arguments:
            Messager.warning(
                "Project configuration: config error: attribute '%s' lacks 'Arg:' specification." % attr.storage_form())
            continue
        allowed = attr.arguments['Arg']
        # attribute applies when the type is listed explicitly, or via
        # the generic <EVENT>/<ENTITY> wildcards
        applies = (ann_type in allowed
                   or (self.is_event_type(ann_type) and '<EVENT>' in allowed)
                   or (self.is_physical_entity_type(ann_type)
                       and '<ENTITY>' in allowed))
        if applies:
            result.append(attr.storage_form())
    return result
def norm_get_name(database, key, collection=None):
    """Look up the primary name for *key* in normalization DB *database*.

    Returns a JSON-ready dict echoing the request with the looked-up
    'value' (None when the key or database is not found).

    Fix: _get_db_path returns a (path, is_unicode) pair; the original
    bound the whole tuple, so the `is None` fallback never fired and the
    tuple itself was passed to normdb as a path.
    """
    if NORM_LOOKUP_DEBUG:
        _check_DB_version(database)
    if REPORT_LOOKUP_TIMINGS:
        lookup_start = datetime.now()

    dbpath, dbunicode = _get_db_path(database, collection)
    if dbpath is None:
        # full path not configured, fall back on name as default
        dbpath = database

    try:
        data = normdb.data_by_id(dbpath, key)
    except normdb.dbNotFoundError as e:
        Messager.warning(str(e))
        data = None

    # just grab the first one (sorry, this is a bit opaque)
    if data is not None:
        value = data[0][0][1]
    else:
        value = None

    if REPORT_LOOKUP_TIMINGS:
        _report_timings(database, lookup_start)

    # echo request for sync
    json_dic = {
        'database': database,
        'key': key,
        'value': value
    }
    return json_dic
def reverse_arc(collection, document, origin, target, type, attributes=None): directory = collection #undo_resp = {} # TODO real_dir = real_directory(directory) #mods = ModificationTracker() # TODO projectconf = ProjectConfiguration(real_dir) document = path_join(real_dir, document) with TextAnnotations(document) as ann_obj: # bail as quick as possible if read-only if ann_obj._read_only: raise AnnotationsIsReadOnlyError(ann_obj.get_document()) if projectconf.is_equiv_type(type): Messager.warning('Cannot reverse Equiv arc') elif not projectconf.is_relation_type(type): Messager.warning('Can only reverse configured binary relations') else: # OK to reverse found = None # TODO: more sensible lookup for ann in ann_obj.get_relations(): if (ann.arg1 == origin and ann.arg2 == target and ann.type == type): found = ann break if found is None: Messager.error('reverse_arc: failed to identify target relation (from %s to %s, type %s) (deleted?)' % (str(origin), str(target), str(type))) else: # found it; just adjust this found.arg1, found.arg2 = found.arg2, found.arg1 # TODO: modification tracker json_response = {} json_response['annotations'] = _json_from_ann(ann_obj) return json_response
def get_configs(directory, filename, defaultstr, minconf, sections):
    """Read and parse the config file *filename* for *directory*.

    Searches the directory tree first, then the default location, then
    falls back to *defaultstr*; parse failures fall back to *minconf*.
    Results are cached per (directory, filename).

    Fix: narrowed a bare `except:` (which also swallows SystemExit and
    KeyboardInterrupt) to `except Exception:`.
    """
    if (directory, filename) not in get_configs.__cache:
        configstr, source = __read_first_in_directory_tree(directory, filename)

        if configstr is None:
            # didn't get one; try default dir and fall back to the default
            configstr = __read_or_default(filename, defaultstr)
            if configstr == defaultstr:
                Messager.info(
                    "Project configuration: no configuration file (%s) found, using default." % filename, 5)
                source = "[default]"
            else:
                source = filename

        # try to parse what was found, fall back to minimal config
        try:
            configs = __parse_configs(configstr, source, sections)
        except Exception:
            Messager.warning(
                "Project configuration: Falling back to minimal default. Configuration is likely wrong.", 5)
            configs = minconf

        get_configs.__cache[(directory, filename)] = configs

    return get_configs.__cache[(directory, filename)]
def norm_get_data(database, key, collection=None):
    """Fetch the full data record for *key* from normalization DB *database*.

    Returns a JSON-ready dict echoing the request with the raw data as
    'value' (None when the key or database is not found).

    Fix: _get_db_path returns a (path, is_unicode) pair; the original
    bound the whole tuple, so the `is None` fallback never fired and the
    tuple itself was passed to normdb as a path.
    """
    if NORM_LOOKUP_DEBUG:
        _check_DB_version(database)
    if REPORT_LOOKUP_TIMINGS:
        lookup_start = datetime.now()

    dbpath, dbunicode = _get_db_path(database, collection)
    if dbpath is None:
        # full path not configured, fall back on name as default
        dbpath = database

    try:
        data = normdb.data_by_id(dbpath, key)
    except normdb.dbNotFoundError as e:
        Messager.warning(str(e))
        data = None

    if data is None:
        Messager.warning("Failed to get data for " + database + ":" + key)

    if REPORT_LOOKUP_TIMINGS:
        _report_timings(database, lookup_start)

    # echo request for sync
    json_dic = {
        'database': database,
        'key': key,
        'value': data
    }
    return json_dic
def _parse_attributes(attributes):
    """Decode the client-supplied JSON *attributes* string into a dict;
    returns {} on None or on a parse failure (after warning the client)."""
    if attributes is None:
        return {}

    try:
        parsed = json_loads(attributes)
    except ValueError:
        # Failed to parse, warn the client
        Messager.warning(
            ('Unable to parse attributes string "%s" for '
             '"createSpan", ignoring attributes for request and '
             'assuming no attributes set') % (attributes, ))
        return {}

    # XXX: Hack since the client is sending back False and True as values...
    # These are __not__ to be sent, they violate the protocol
    for dead_key in [k for k, v in list(parsed.items()) if v == False]:
        del parsed[dead_key]

    # These are to be old-style modifiers without values
    for flag_key in [k for k, v in list(parsed.items()) if v]:
        parsed[flag_key] = True
    ###

    return parsed
def arc_types_from_to(self, from_ann, to_ann="<ANY>", include_special=False):
    """
    Returns the possible arc types that can connect an annotation
    of type from_ann to an annotation of type to_ann.
    If to_ann has the value \"<ANY>\", returns all possible arc types.
    """
    from_node = get_node_by_storage_form(self.directory, from_ann)

    if from_node is None:
        Messager.warning("Project configuration: unknown textbound/event type %s. Configuration may be wrong." % from_ann)
        return []

    if to_ann == "<ANY>":
        # all argument roles plus all relations with from_ann as arg1
        relations_from = get_relations_by_arg1(self.directory, from_ann, include_special)
        # TODO: consider using from_node.arg_list instead of .arguments for order
        return unique_preserve_order([role for role in from_node.arguments] + [r.storage_form() for r in relations_from])

    # specific hits
    types = from_node.keys_by_type.get(to_ann, [])

    # wildcard target entries apply to every to_ann
    if "<ANY>" in from_node.keys_by_type:
        types += from_node.keys_by_type["<ANY>"]

    # generic arguments
    if self.is_event_type(to_ann) and '<EVENT>' in from_node.keys_by_type:
        types += from_node.keys_by_type['<EVENT>']
    if self.is_physical_entity_type(to_ann) and '<ENTITY>' in from_node.keys_by_type:
        types += from_node.keys_by_type['<ENTITY>']

    # relations
    types.extend(self.relation_types_from_to(from_ann, to_ann))

    # dedupe while keeping first-seen order
    return unique_preserve_order(types)
def __directory_relations_by_arg_num(directory, num, atype, include_special=False):
    """Collect configured relation types whose argument *num* (0 or 1)
    accepts *atype* (honoring <ANY> wildcards on either side)."""
    assert num >= 0 and num < 2, "INTERNAL ERROR"
    matching = []
    for rel in get_relation_type_list(directory):
        # "Special" nesting relation ignored unless specifically
        # requested
        if rel.storage_form() == ENTITY_NESTING_TYPE and not include_special:
            continue
        if len(rel.arg_list) != 2:
            Messager.warning(
                "Relation type %s has %d arguments in configuration (%s; expected 2). Please fix configuration." % (rel.storage_form(), len(rel.arg_list), ",".join(rel.arg_list)))
            continue
        # TODO: "wildcards" other than <ANY>
        for candidate in rel.arguments[rel.arg_list[num]]:
            if candidate == "<ANY>" or atype == "<ANY>" or candidate == atype:
                matching.append(rel)
    return matching
def _enrich_json_with_text(j_dic, txt_file_path, raw_text=None):
    """Populate j_dic with the document text plus token and sentence
    offsets, using the tokenizer/splitter configured for the directory.

    Fix: removed an unused local import (`from logging import info as
    log_info`); nothing in the function referenced it.
    """
    if raw_text is not None:
        # looks like somebody read this already; nice
        text = raw_text
    else:
        # need to read raw text
        try:
            with open_textfile(txt_file_path, "r") as txt_file:
                text = txt_file.read()
        except IOError:
            raise UnableToReadTextFile(txt_file_path)
        except UnicodeDecodeError:
            Messager.error("Error reading text file: nonstandard encoding or binary?", -1)
            raise UnableToReadTextFile(txt_file_path)

    j_dic["text"] = text

    tokeniser = options_get_tokenization(dirname(txt_file_path))

    # First, generate tokenisation
    if tokeniser == "mecab":
        from tokenise import jp_token_boundary_gen
        tok_offset_gen = jp_token_boundary_gen
    elif tokeniser == "whitespace":
        from tokenise import whitespace_token_boundary_gen
        tok_offset_gen = whitespace_token_boundary_gen
    elif tokeniser == "ptblike":
        from tokenise import gtb_token_boundary_gen
        tok_offset_gen = gtb_token_boundary_gen
    else:
        Messager.warning("Unrecognized tokenisation option "
                         ", reverting to whitespace tokenisation.")
        from tokenise import whitespace_token_boundary_gen
        tok_offset_gen = whitespace_token_boundary_gen
    j_dic["token_offsets"] = [o for o in tok_offset_gen(text)]

    ssplitter = options_get_ssplitter(dirname(txt_file_path))
    if ssplitter == "newline":
        from ssplit import newline_sentence_boundary_gen
        ss_offset_gen = newline_sentence_boundary_gen
    elif ssplitter == "regex":
        from ssplit import regex_sentence_boundary_gen
        ss_offset_gen = regex_sentence_boundary_gen
    else:
        Messager.warning("Unrecognized sentence splitting option "
                         ", reverting to newline sentence splitting.")
        from ssplit import newline_sentence_boundary_gen
        ss_offset_gen = newline_sentence_boundary_gen
    j_dic["sentence_offsets"] = [o for o in ss_offset_gen(text)]

    return True
def create_arc(collection, document, origin, target, type, attributes=None,
               old_type=None, old_target=None, comment=None):
    """Create or update an arc between two annotations.

    Dispatches on the configured category of *type* (Equiv, binary
    relation, or event argument). When updating and the old and new arcs
    fall into different categories, the old arc is deleted and a new one
    created instead of updating in place. Returns the modification
    tracker's JSON response with the updated annotations.
    """
    directory = collection
    undo_resp = {}

    real_dir = real_directory(directory)

    mods = ModificationTracker()

    projectconf = ProjectConfiguration(real_dir)

    document = path_join(real_dir, document)
    with TextAnnotations(document) as ann_obj:
        # bail as quick as possible if read-only
        # TODO: make consistent across the different editing
        # functions, integrate ann_obj initialization and checks
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())

        origin = ann_obj.get_ann_by_id(origin)
        target = ann_obj.get_ann_by_id(target)

        # if there is a previous annotation and the arcs aren't in
        # the same category (e.g. relation vs. event arg), process
        # as delete + create instead of update.
        if old_type is not None and (
                projectconf.is_relation_type(old_type) !=
                projectconf.is_relation_type(type) or
                projectconf.is_equiv_type(old_type) !=
                projectconf.is_equiv_type(type)):
            _delete_arc_with_ann(origin.id, old_target, old_type, mods,
                                 ann_obj, projectconf)
            old_target, old_type = None, None

        if projectconf.is_equiv_type(type):
            ann = _create_equiv(ann_obj, projectconf, mods, origin, target,
                                type, attributes, old_type, old_target)
        elif projectconf.is_relation_type(type):
            ann = _create_relation(ann_obj, projectconf, mods, origin, target,
                                   type, attributes, old_type, old_target)
        else:
            ann = _create_argument(ann_obj, projectconf, mods, origin, target,
                                   type, attributes, old_type, old_target)

        # process comments
        if ann is not None:
            _set_comments(ann_obj, ann, comment, mods,
                          undo_resp=undo_resp)
        elif comment is not None:
            # some arc kinds yield no comment-bearing annotation object
            Messager.warning('create_arc: non-empty comment for None annotation (unsupported type for comment?)')

        mods_json = mods.json_response()
        mods_json['annotations'] = _json_from_ann(ann_obj)
        return mods_json
def create_arc(collection, document, origin, target, type, attributes=None,
               old_type=None, old_target=None, comment=None):
    """Create or update an arc between two annotations.

    Dispatches on the configured category of *type* (Equiv, binary
    relation, or event argument). When updating and the old and new arcs
    fall into different categories, the old arc is deleted and a new one
    created instead of updating in place. Returns the modification
    tracker's JSON response with the updated annotations.
    """
    directory = collection
    undo_resp = {}

    real_dir = real_directory(directory)

    mods = ModificationTracker()

    projectconf = ProjectConfiguration(real_dir)

    document = path_join(real_dir, document)
    with TextAnnotations(document) as ann_obj:
        # bail as quick as possible if read-only
        # TODO: make consistent across the different editing
        # functions, integrate ann_obj initialization and checks
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())

        origin = ann_obj.get_ann_by_id(origin)
        target = ann_obj.get_ann_by_id(target)

        # if there is a previous annotation and the arcs aren't in
        # the same category (e.g. relation vs. event arg), process
        # as delete + create instead of update.
        if old_type is not None and (
                projectconf.is_relation_type(old_type) !=
                projectconf.is_relation_type(type) or
                projectconf.is_equiv_type(old_type) !=
                projectconf.is_equiv_type(type)):
            _delete_arc_with_ann(origin.id, old_target, old_type, mods,
                                 ann_obj, projectconf)
            old_target, old_type = None, None

        if projectconf.is_equiv_type(type):
            ann = _create_equiv(ann_obj, projectconf, mods, origin, target,
                                type, attributes, old_type, old_target)
        elif projectconf.is_relation_type(type):
            ann = _create_relation(ann_obj, projectconf, mods, origin, target,
                                   type, attributes, old_type, old_target)
        else:
            ann = _create_argument(ann_obj, projectconf, mods, origin, target,
                                   type, attributes, old_type, old_target)

        # process comments
        if ann is not None:
            _set_comments(ann_obj, ann, comment, mods,
                          undo_resp=undo_resp)
        elif comment is not None:
            # some arc kinds yield no comment-bearing annotation object
            Messager.warning(
                'create_arc: non-empty comment for None annotation (unsupported type for comment?)')

        mods_json = mods.json_response()
        mods_json['annotations'] = _json_from_ann(ann_obj)
        return mods_json
def jp_token_boundary_gen(text):
    """Yield token boundary offsets for Japanese *text* via MeCab,
    warning once if a conflicting TOKENIZATION setting is active."""
    # TODO: consider honoring WHITESPACE_TOKENIZATION for japanese also
    if TOKENIZATION is not None and TOKENIZATION != JAPANESE_TOKENIZATION:
        from message import Messager
        Messager.warning('Ignoring unexpected TOKENIZATION '
                         'specification for Japanese.')
    from mecab import token_offsets_gen
    yield from token_offsets_gen(text)
def en_token_boundary_gen(text):
    """Return a token-boundary generator for English *text*, selected by
    the module-level TOKENIZATION setting (simple whitespace by default).

    Fix: corrected typo in the user-facing warning ("Engligh").
    """
    if TOKENIZATION is None or TOKENIZATION == WHITESPACE_TOKENIZATION:
        return en_token_boundary_gen_simple(text)
    elif TOKENIZATION == PTBLIKE_TOKENIZATION:
        return en_token_boundary_gen_gtb(text)
    else:
        from message import Messager
        Messager.warning('Unrecognized English tokenization options '
                         'for English, reverting to simple tokenization.')
        return en_token_boundary_gen_simple(text)
def multiple_allowed_arguments(self, type):
    """
    Returns the argument types that are allowed to be filled more
    than once for an annotation of the given type.
    """
    conf_node = get_node_by_storage_form(self.directory, type)
    if conf_node is not None:
        return conf_node.multiple_allowed_arguments
    Messager.warning("Project configuration: unknown event type %s. Configuration may be wrong." % type)
    return []
def mandatory_arguments(self, type):
    """
    Returns the mandatory argument types that must be present for
    an annotation of the given type.
    """
    conf_node = get_node_by_storage_form(self.directory, type)
    if conf_node is not None:
        return conf_node.mandatory_arguments
    Messager.warning("Project configuration: unknown event type %s. Configuration may be wrong." % type)
    return []
def search_anns_for_event(ann_objs, trigger_text, args, restrict_types=[], ignore_types=[]):
    """
    Searches the given Annotations objects for Event annotations
    matching the given specification. Returns a SearchMatchSet object.

    Fixes: (1) the Python-2 `sort(lambda a,b: cmp(...))` call (invalid
    on Python 3) is replaced by an equivalent key-based sort; (2) a
    missing `continue` after a failed trigger lookup previously let an
    undefined/stale t_ann be used below.
    """
    # treat None and empty list uniformly
    restrict_types = [] if restrict_types is None else restrict_types
    ignore_types = [] if ignore_types is None else ignore_types

    # TODO: include args in description
    description = "Event triggered by text containing '%s'" % trigger_text
    if restrict_types != []:
        description = description + ' (of type %s)' % (",".join(restrict_types))
    matches = SearchMatchSet(description)

    for ann_obj in ann_objs:
        # collect per-document (ann_obj) for sorting
        ann_matches = []
        for e in ann_obj.get_events():
            if e.type in ignore_types:
                continue
            if restrict_types != [] and e.type not in restrict_types:
                continue
            try:
                t_ann = ann_obj.get_ann_by_id(e.trigger)
            except:  # TODO: specific exception
                Messager.error('Failed to retrieve trigger annotation %s, skipping event %s in search' % (e.trigger, e.id))
                # without this continue, t_ann below was undefined/stale
                continue

            # TODO: make options for "text included" vs. "text matches"
            # TODO: remove temporary hack giving special status to "*"
            if (trigger_text is not None and trigger_text != "" and
                    trigger_text != "*" and trigger_text not in t_ann.text):
                continue

            # TODO: argument constraints
            if len(args) != 0:
                Messager.warning('NOTE: ignoring event argument constraints in search (not implemented yet, sorry!)')

            ann_matches.append((t_ann, e))

        # sort by trigger start offset (longest span first on ties)
        ann_matches.sort(key=lambda m: (m[0].start, -m[0].end))

        # add to overall collection
        for t_obj, e in ann_matches:
            matches.add_match(ann_obj, e)

    # sort by document name for output
    matches.sort_matches()

    return matches
def norm_get_data(database, key):
    """Fetch the data record for *key* from *database* (treated directly
    as the DB path here); warns and leaves data as None when not found.

    Fix: `except normdb.dbNotFoundError, e:` is Python-2-only syntax and
    a SyntaxError on Python 3; changed to `as e`.

    NOTE(review): this variant appears truncated — it computes `data`
    but never returns it; confirm against the fuller norm_get_data
    overload elsewhere in the project.
    """
    if NORM_LOOKUP_DEBUG:
        _check_DB_version(database)
    if REPORT_LOOKUP_TIMINGS:
        lookup_start = datetime.now()

    try:
        data = normdb.data_by_id(database, key)
    except normdb.dbNotFoundError as e:
        Messager.warning(str(e))
        data = None
def get_labels(directory):
    """Return a dict mapping each storage form to its list of display
    labels for *directory*; the result is cached per directory."""
    cache = get_labels.__cache
    if directory not in cache:
        by_form = {}
        for t in get_visual_configs(directory)[LABEL_SECTION]:
            if t.storage_form() in by_form:
                Messager.warning("In configuration, labels for '%s' defined more than once. Only using the last set." % t.storage_form(), -1)
            # first is storage for, rest are labels.
            by_form[t.storage_form()] = t.terms[1:]
        cache[directory] = by_form
    return cache[directory]
def _enrich_json_with_text(j_dic, txt_file_path, raw_text=None):
    """Populate j_dic with the document text plus token and sentence
    offsets, using the tokenizer/splitter configured for the directory.

    Fix: removed an unused local import (`from logging import info as
    log_info`); nothing in the function referenced it.
    """
    if raw_text is not None:
        # looks like somebody read this already; nice
        text = raw_text
    else:
        # need to read raw text
        try:
            with open_textfile(txt_file_path, 'r') as txt_file:
                text = txt_file.read()
        except IOError:
            raise UnableToReadTextFile(txt_file_path)
        except UnicodeDecodeError:
            Messager.error(
                'Error reading text file: nonstandard encoding or binary?', -1)
            raise UnableToReadTextFile(txt_file_path)

    j_dic['text'] = text

    tokeniser = options_get_tokenization(dirname(txt_file_path))

    # First, generate tokenisation
    if tokeniser == 'mecab':
        from tokenise import jp_token_boundary_gen
        tok_offset_gen = jp_token_boundary_gen
    elif tokeniser == 'whitespace':
        from tokenise import whitespace_token_boundary_gen
        tok_offset_gen = whitespace_token_boundary_gen
    elif tokeniser == 'ptblike':
        from tokenise import gtb_token_boundary_gen
        tok_offset_gen = gtb_token_boundary_gen
    else:
        Messager.warning('Unrecognized tokenisation option '
                         ', reverting to whitespace tokenisation.')
        from tokenise import whitespace_token_boundary_gen
        tok_offset_gen = whitespace_token_boundary_gen
    j_dic['token_offsets'] = [o for o in tok_offset_gen(text)]

    ssplitter = options_get_ssplitter(dirname(txt_file_path))
    if ssplitter == 'newline':
        from ssplit import newline_sentence_boundary_gen
        ss_offset_gen = newline_sentence_boundary_gen
    elif ssplitter == 'regex':
        from ssplit import regex_sentence_boundary_gen
        ss_offset_gen = regex_sentence_boundary_gen
    else:
        Messager.warning('Unrecognized sentence splitting option '
                         ', reverting to newline sentence splitting.')
        from ssplit import newline_sentence_boundary_gen
        ss_offset_gen = newline_sentence_boundary_gen
    j_dic['sentence_offsets'] = [o for o in ss_offset_gen(text)]

    return True
def norm_search(database, name, collection=None, exactmatch=False):
    """Run a normalization search for *name* in *database*; when the DB
    is missing, warn the user and return an empty result structure."""
    try:
        return _norm_search_impl(database, name, collection, exactmatch)
    except simstringdb.ssdbNotFoundError as e:
        Messager.warning(str(e))
        empty_result = {
            'database': database,
            'query': name,
            'header': [],
            'items': [],
        }
        return empty_result
def mandatory_arguments(self, type):
    """
    Returns the mandatory argument types that must be present for
    an annotation of the given type.
    """
    conf_node = get_node_by_storage_form(self.directory, type)
    if conf_node is not None:
        return conf_node.mandatory_arguments
    Messager.warning(
        "Project configuration: unknown event type %s. Configuration may be wrong." % type)
    return []
def multiple_allowed_arguments(self, type):
    """
    Returns the argument types that are allowed to be filled more
    than once for an annotation of the given type.
    """
    conf_node = get_node_by_storage_form(self.directory, type)
    if conf_node is not None:
        return conf_node.multiple_allowed_arguments
    Messager.warning(
        "Project configuration: unknown event type %s. Configuration may be wrong." % type)
    return []
def get_node_by_storage_form(directory, term):
    """Return the entity/event type node whose storage form is *term*.

    Builds and caches (per directory) a storage-form -> node index over
    all configured entity and event types; duplicate storage forms keep
    the last definition, with a warning. Returns None when *term* is not
    configured.
    """
    cache = get_node_by_storage_form.__cache
    if directory not in cache:
        d = {}
        for e in get_entity_type_list(directory) + get_event_type_list(directory):
            t = e.storage_form()
            if t in d:
                Messager.warning("Project configuration: term %s appears multiple times, only using last. Configuration may be wrong." % t, 5)
            d[t] = e
        cache[directory] = d

    return cache[directory].get(term, None)
def get_labels(directory):
    """Return a dict mapping each storage form to its list of display
    labels for *directory*; the result is cached per directory."""
    cache = get_labels.__cache
    if directory not in cache:
        by_form = {}
        for t in get_visual_configs(directory)[LABEL_SECTION]:
            if t.storage_form() in by_form:
                Messager.warning(
                    "In configuration, labels for '%s' defined more than once. Only using the last set." % t.storage_form(), -1)
            # first is storage for, rest are labels.
            by_form[t.storage_form()] = t.terms[1:]
        cache[directory] = by_form
    return cache[directory]
def compcode(compcode, collection, document):
    ## We want to write the compcode and user somewhere
    # Append a task-completion record (timestamp, user, code, collection)
    # to the shared user log and confirm to the client.
    # NOTE(review): `document` is accepted but never used — confirm
    # whether it should be logged as well.
    try:
        user = get_session()['user']
    except KeyError:
        # no logged-in user in the session; log a placeholder instead
        Messager.warning('Not logged in??')
        user = '******'
    # NOTE(review): hard-coded absolute AFS path — breaks outside this
    # deployment and is world-writable-by-app; consider a config option.
    with open('/afs/inf.ed.ac.uk/web/securepages/clai/web/brat/work/userlog.txt', 'a') as f:
        f.write("COMPLETION, %s, %s, %s, %s\n" % (str(datetime.now()), user, compcode, collection))
    Messager.info('Thank you! Task completion has been logged!')
    return {}
def _enrich_json_with_text(j_dic, txt_file_path, raw_text=None):
    """Populate j_dic with the document text plus token and sentence
    offsets, using the tokenizer and sentence splitter configured for
    the text file's directory. Returns True on success.

    Raises UnableToReadTextFile when the text cannot be read or decoded.
    """
    if raw_text is not None:
        # looks like somebody read this already; nice
        text = raw_text
    else:
        # need to read raw text
        try:
            with open_textfile(txt_file_path, 'r') as txt_file:
                text = txt_file.read()
        except IOError:
            raise UnableToReadTextFile(txt_file_path)
        except UnicodeDecodeError:
            Messager.error('Error reading text file: nonstandard encoding or binary?', -1)
            raise UnableToReadTextFile(txt_file_path)

    j_dic['text'] = text

    tokeniser = options_get_tokenization(dirname(txt_file_path))

    # First, generate tokenisation
    # (generator modules are imported lazily, only for the chosen option)
    if tokeniser == 'mecab':
        from tokenise import jp_token_boundary_gen
        tok_offset_gen = jp_token_boundary_gen
    elif tokeniser == 'whitespace':
        from tokenise import whitespace_token_boundary_gen
        tok_offset_gen = whitespace_token_boundary_gen
    elif tokeniser == 'ptblike':
        from tokenise import gtb_token_boundary_gen
        tok_offset_gen = gtb_token_boundary_gen
    else:
        # unknown option: warn and fall back to whitespace tokenisation
        Messager.warning('Unrecognized tokenisation option '
                         ', reverting to whitespace tokenisation.')
        from tokenise import whitespace_token_boundary_gen
        tok_offset_gen = whitespace_token_boundary_gen
    j_dic['token_offsets'] = [o for o in tok_offset_gen(text)]

    ssplitter = options_get_ssplitter(dirname(txt_file_path))
    if ssplitter == 'newline':
        from ssplit import newline_sentence_boundary_gen
        ss_offset_gen = newline_sentence_boundary_gen
    elif ssplitter == 'regex':
        from ssplit import regex_sentence_boundary_gen
        ss_offset_gen = regex_sentence_boundary_gen
    else:
        # unknown option: warn and fall back to newline splitting
        Messager.warning('Unrecognized sentence splitting option '
                         ', reverting to newline sentence splitting.')
        from ssplit import newline_sentence_boundary_gen
        ss_offset_gen = newline_sentence_boundary_gen
    j_dic['sentence_offsets'] = [o for o in ss_offset_gen(text)]

    return True
def _parse_span_normalizations(normalizations):
    """Decode the client-supplied JSON *normalizations* string; returns
    {} on None or on a parse failure (after warning the client)."""
    if normalizations is None:
        return {}
    try:
        return json_loads(normalizations)
    except ValueError:
        # Failed to parse, warn the client
        Messager.warning((u'Unable to parse normalizations string "%s" for '
                          u'"createSpan", ignoring normalizations for request and '
                          u'assuming no normalizations set') % (normalizations, ))
        return {}
def get_node_by_storage_form(directory, term):
    """Return the entity/event type node whose storage form is *term*.

    Builds and caches (per directory) a storage-form -> node index over
    all configured entity and event types; duplicate storage forms keep
    the last definition, with a warning. Returns None when *term* is not
    configured.
    """
    cache = get_node_by_storage_form.__cache
    if directory not in cache:
        d = {}
        for e in get_entity_type_list(directory) + get_event_type_list(
                directory):
            t = e.storage_form()
            if t in d:
                Messager.warning(
                    "Project configuration: term %s appears multiple times, only using last. Configuration may be wrong." % t, 5)
            d[t] = e
        cache[directory] = d

    return cache[directory].get(term, None)
def norm_get_name(database, key, collection=None):
    """Look up data for *key* in normalization DB *database*.

    Fixes: (1) `except normdb.dbNotFoundError, e:` is Python-2-only
    syntax and a SyntaxError on Python 3; changed to `as e`.
    (2) _get_db_path returns a (path, is_unicode) pair; the original
    bound the whole tuple, so the `is None` fallback never fired and the
    tuple itself was passed to normdb as a path.

    NOTE(review): this variant appears truncated — it computes `data`
    but never returns; confirm against the fuller norm_get_name
    overload elsewhere in the project.
    """
    if NORM_LOOKUP_DEBUG:
        _check_DB_version(database)
    if REPORT_LOOKUP_TIMINGS:
        lookup_start = datetime.now()

    dbpath, dbunicode = _get_db_path(database, collection)
    if dbpath is None:
        # full path not configured, fall back on name as default
        dbpath = database

    try:
        data = normdb.data_by_id(dbpath, key)
    except normdb.dbNotFoundError as e:
        Messager.warning(str(e))
        data = None
def __parse_kb_shortcuts(shortcutstr, default, source):
    """Parse keyboard-shortcut configuration into a {key: type} dict.

    Each non-empty, non-comment line of *shortcutstr* is "KEY TYPE"
    (whitespace-separated); the first definition of a key wins. On any
    parse error, warns and returns *default* unchanged.

    Fix: narrowed a bare `except:` (which also swallows SystemExit and
    KeyboardInterrupt) to `except Exception:`.
    """
    try:
        shortcuts = {}
        for l in shortcutstr.split("\n"):
            l = l.strip()
            if l == "" or l[:1] == "#":
                continue
            key, type = re.split(r'[ \t]+', l)
            if key in shortcuts:
                Messager.warning("Project configuration: keyboard shortcut for '%s' defined multiple times. Ignoring all but first ('%s')" % (key, shortcuts[key]))
            else:
                shortcuts[key] = type
    except Exception:  # TODO: specific exception handling
        Messager.warning("Project configuration: error parsing keyboard shortcuts from %s. Configuration may be wrong." % source, 5)
        shortcuts = default
    return shortcuts
def store_svg(collection, document, svg):
    """Save the rendered SVG for *document* and attempt best-effort
    conversions to the formats listed in config SVG_CONVERSION_COMMANDS.

    Returns {"stored": [...]} listing each stored format.

    Fix: narrowed a bare `except:` (which also swallows SystemExit and
    KeyboardInterrupt) to `except Exception:`; the best-effort fallback
    behavior is unchanged.
    """
    stored = []

    _save_svg(collection, document, svg)
    stored.append({"name": "svg", "suffix": SVG_SUFFIX})

    # attempt conversions from SVG to other formats
    try:
        from config import SVG_CONVERSION_COMMANDS
    except ImportError:
        SVG_CONVERSION_COMMANDS = []

    for format, command in SVG_CONVERSION_COMMANDS:
        try:
            from os import system

            svgfn = _svg_path()
            # TODO: assuming format name matches suffix; generalize
            outfn = svgfn.replace("." + SVG_SUFFIX, "." + format)
            # NOTE(review): command is interpolated into a shell string
            # and run via os.system — safe only if config commands and
            # paths are trusted; consider subprocess.run([...]).
            cmd = command % (svgfn, outfn)

            import logging
            logging.error(cmd)

            retval = system(cmd)

            # TODO: this check may not work on all architectures.
            # consider rather checking is the intended output file
            # exists (don't forget to delete a possible old one
            # with the same name, though).
            # if retval != 0:
            #     stored.append({'name': format, 'suffix': format})
            # else:
            #     Messager.warning("Failed conversion to %s" % format)
            # I'm getting weird return values from inkscape; will
            # just assume everything's OK ...
            # TODO: check return value, react appropriately
            stored.append({"name": format, "suffix": format})
        except Exception:
            Messager.warning("Failed conversion to %s" % format)
            # no luck, but doesn't matter
            pass

    return {"stored": stored}
def store_svg(collection, document, svg):
    """Save the rendered SVG for *document* and attempt best-effort
    conversions to the formats listed in config SVG_CONVERSION_COMMANDS.

    Returns {'stored': [...]} listing each stored format.

    Fix: narrowed a bare `except:` (which also swallows SystemExit and
    KeyboardInterrupt) to `except Exception:`; the best-effort fallback
    behavior is unchanged.
    """
    stored = []

    _save_svg(collection, document, svg)
    stored.append({'name': 'svg', 'suffix': SVG_SUFFIX})

    # attempt conversions from SVG to other formats
    try:
        from config import SVG_CONVERSION_COMMANDS
    except ImportError:
        SVG_CONVERSION_COMMANDS = []

    for format, command in SVG_CONVERSION_COMMANDS:
        try:
            from os import system

            svgfn = _svg_path()
            # TODO: assuming format name matches suffix; generalize
            outfn = svgfn.replace('.' + SVG_SUFFIX, '.' + format)
            # NOTE(review): command is interpolated into a shell string
            # and run via os.system — safe only if config commands and
            # paths are trusted; consider subprocess.run([...]).
            cmd = command % (svgfn, outfn)

            import logging
            logging.error(cmd)

            retval = system(cmd)

            # TODO: this check may not work on all architectures.
            # consider rather checking is the intended output file
            # exists (don't forget to delete a possible old one
            # with the same name, though).
            # if retval != 0:
            #     stored.append({'name': format, 'suffix': format})
            # else:
            #     Messager.warning("Failed conversion to %s" % format)
            # I'm getting weird return values from inkscape; will
            # just assume everything's OK ...
            # TODO: check return value, react appropriately
            stored.append({'name': format, 'suffix': format})
        except Exception:
            Messager.warning("Failed conversion to %s" % format)
            # no luck, but doesn't matter
            pass

    return {'stored': stored}
def arc_types_from_to(self, from_ann, to_ann="<ANY>", include_special=False):
    """
    Returns the possible arc types that can connect an annotation
    of type from_ann to an annotation of type to_ann.
    If to_ann has the value \"<ANY>\", returns all possible arc types.
    """
    from_node = get_node_by_storage_form(self.directory, from_ann)

    if from_node is None:
        Messager.warning(
            "Project configuration: unknown textbound/event type %s. Configuration may be wrong." % from_ann)
        return []

    if to_ann == "<ANY>":
        # all argument roles plus all relations with from_ann as arg1
        relations_from = get_relations_by_arg1(self.directory, from_ann,
                                               include_special)
        # TODO: consider using from_node.arg_list instead of .arguments for order
        return unique_preserve_order(
            [role for role in from_node.arguments] +
            [r.storage_form() for r in relations_from])

    # specific hits
    types = from_node.keys_by_type.get(to_ann, [])

    # wildcard target entries apply to every to_ann
    if "<ANY>" in from_node.keys_by_type:
        types += from_node.keys_by_type["<ANY>"]

    # generic arguments
    if self.is_event_type(to_ann) and '<EVENT>' in from_node.keys_by_type:
        types += from_node.keys_by_type['<EVENT>']
    if self.is_physical_entity_type(
            to_ann) and '<ENTITY>' in from_node.keys_by_type:
        types += from_node.keys_by_type['<ENTITY>']

    # relations
    types.extend(self.relation_types_from_to(from_ann, to_ann))

    # dedupe while keeping first-seen order
    return unique_preserve_order(types)
def __parse_kb_shortcuts(shortcutstr, default, source):
    """Parse keyboard shortcut configuration: one "KEY TYPE" pair per
    line, blank lines and '#' comment lines ignored.

    Returns a dict mapping shortcut key to annotation type, or the
    given default if parsing fails.
    """
    try:
        shortcuts = {}
        for l in shortcutstr.split("\n"):
            l = l.strip()
            # skip blanks and comments
            if l == "" or l[:1] == "#":
                continue
            key, atype = re.split(r'[ \t]+', l)
            if key in shortcuts:
                Messager.warning(
                    "Project configuration: keyboard shortcut for '%s' defined multiple times. Ignoring all but first ('%s')" % (key, shortcuts[key]))
            else:
                shortcuts[key] = atype
    except ValueError:
        # raised by the 2-tuple unpack above when a line does not have
        # exactly two whitespace-separated fields (was a bare except:)
        Messager.warning(
            "Project configuration: error parsing keyboard shortcuts from %s. Configuration may be wrong." % source, 5)
        shortcuts = default
    return shortcuts
def __directory_relations_by_arg_num(directory, num, atype, include_special=False):
    """Return the configured relation types whose argument number `num`
    (0 or 1) accepts the given annotation type.

    A relation is appended once per matching argument-type entry, so a
    relation whose argument lists several matching types appears more
    than once in the result.
    """
    assert num >= 0 and num < 2, "INTERNAL ERROR"

    matching = []
    for rel in get_relation_type_list(directory):
        # "Special" nesting relation ignored unless specifically
        # requested
        if rel.storage_form() == ENTITY_NESTING_TYPE and not include_special:
            continue

        if len(rel.arg_list) != 2:
            Messager.warning("Relation type %s has %d arguments in configuration (%s; expected 2). Please fix configuration." % (rel.storage_form(), len(rel.arg_list), ",".join(rel.arg_list)))
            continue

        accepted = rel.arguments[rel.arg_list[num]]
        for accepted_type in accepted:
            # TODO: "wildcards" other than <ANY>
            if accepted_type == "<ANY>" or atype == "<ANY>" or accepted_type == atype:
                matching.append(rel)

    return matching
def reverse_arc(collection, document, origin, target, type, attributes=None):
    """Swap the direction of the binary relation of the given type
    between origin and target in the given document; returns the
    updated annotations as a JSON-ready dict.
    """
    directory = collection
    # undo_resp = {} # TODO
    real_dir = real_directory(directory)
    # mods = ModificationTracker() # TODO
    projectconf = ProjectConfiguration(real_dir)

    document = urllib.parse.unquote(document)
    document = path_join(real_dir, document)

    with TextAnnotations(document) as ann_obj:
        # bail as quick as possible if read-only
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())

        if projectconf.is_equiv_type(type):
            Messager.warning('Cannot reverse Equiv arc')
        elif not projectconf.is_relation_type(type):
            Messager.warning('Can only reverse configured binary relations')
        else:
            # OK to reverse
            matched = None
            # TODO: more sensible lookup
            for rel in ann_obj.get_relations():
                if rel.arg1 == origin and rel.arg2 == target and rel.type == type:
                    matched = rel
                    break

            if matched is None:
                Messager.error(
                    'reverse_arc: failed to identify target relation (from %s to %s, type %s) (deleted?)' % (str(origin), str(target), str(type)))
            else:
                # found it; just adjust this
                matched.arg1, matched.arg2 = matched.arg2, matched.arg1
                # TODO: modification tracker

        json_response = {}
        json_response['annotations'] = _json_from_ann(ann_obj)
        return json_response
def _get_db_path(database, collection):
    """Resolve the filesystem path of the named normalization DB as
    configured for the given collection; returns None to select the
    default DB location.
    """
    if collection is None:
        # TODO: default to WORK_DIR config?
        return None

    try:
        projectconf = ProjectConfiguration(real_directory(collection))
        for entry in projectconf.get_normalization_config():
            dbname, dbpath = entry[0], entry[3]
            if dbname == database:
                return dbpath
        # not found in config.
        Messager.warning('DB ' + database + ' not defined in config for ' +
                         collection + ', falling back on default.')
        return None
    except Exception:
        # whatever goes wrong, just warn and fall back on the default.
        Messager.warning('Failed to get DB path from config for ' +
                         collection + ', falling back on default.')
        return None
def attributes_for(self, ann_type):
    """Return a list of the possible attribute types for an annotation
    of the given type.
    """
    possible = []
    for attr in get_attribute_type_list(self.directory):
        if attr == SEPARATOR_STR:
            continue

        if 'Arg' not in attr.arguments:
            Messager.warning("Project configuration: config error: attribute '%s' lacks 'Arg:' specification." % attr.storage_form())
            continue

        targets = attr.arguments['Arg']
        applies = (ann_type in targets or
                   (self.is_event_type(ann_type) and '<EVENT>' in targets) or
                   (self.is_physical_entity_type(ann_type) and '<ENTITY>' in targets))
        if applies:
            possible.append(attr.storage_form())

    return possible
def get_configs(directory, filename, defaultstr, minconf, sections):
    """Read and parse the named configuration file for the directory,
    memoizing per (directory, filename); falls back first to
    defaultstr and finally to the minimal config minconf.
    """
    cache = get_configs.__cache
    key = (directory, filename)
    if key not in cache:
        configstr, source = __read_first_in_directory_tree(directory, filename)

        if configstr is None:
            # didn't get one; try default dir and fall back to the default
            configstr = __read_or_default(filename, defaultstr)
            if configstr == defaultstr:
                Messager.info("Project configuration: no configuration file (%s) found, using default." % filename, 5)
                source = "[default]"
            else:
                source = filename

        # try to parse what was found, fall back to minimal config
        try:
            configs = __parse_configs(configstr, source, sections)
        except:
            Messager.warning("Project configuration: Falling back to minimal default. Configuration is likely wrong.", 5)
            configs = minconf

        cache[key] = configs

    return cache[key]
def __doc_or_dir_to_annotations(directory, document, scope):
    """
    Given a directory, a document, and a scope specification
    with the value "collection" or "document" selecting between
    the two, returns Annotations object for either the specific
    document identified (scope=="document") or all documents in
    the given directory (scope=="collection").
    """
    # TODO: lots of magic values here; try to avoid this

    if scope == "collection":
        return __directory_to_annotations(directory)

    if scope == "document":
        # NOTE: "/NO-DOCUMENT/" is a workaround for a brat
        # client-server comm issue (issue #513).
        if document in ("", "/NO-DOCUMENT/"):
            Messager.warning('No document selected for search in document.')
            return []
        return __document_to_annotations(directory, document)

    Messager.error('Unrecognized search scope specification %s' % scope)
    return []
def _create_equiv(ann_obj, projectconf, mods, origin, target, type,
                  attributes, old_type, old_target):
    """Create a new Equiv annotation between origin and target, or
    modify an existing one (currently only the no-op modification is
    supported). Returns the new Equiv annotation, or None when
    modifying an existing one.
    """
    # due to legacy representation choices for Equivs (i.e. no
    # unique ID), support for attributes for Equivs would need
    # some extra work. Getting the easy non-Equiv case first.
    if attributes is not None:
        Messager.warning(
            '_create_equiv: attributes for Equiv annotation not supported yet, please tell the devs if you need this feature (mention "issue #799").'
        )
        attributes = None

    ann = None
    if old_type is None:
        # new annotation

        # sanity
        assert old_target is None, '_create_equiv: incoherent args: old_type is None, old_target is not None (client/protocol error?)'

        # NOTE: was unicode(), a NameError on Python 3; use str(),
        # matching the other _create_equiv implementation in this file.
        ann = EquivAnnotation(
            type, [str(origin.id), str(target.id)], '')
        ann_obj.add_annotation(ann)
        mods.addition(ann)

        # TODO: attributes
        assert attributes is None, "INTERNAL ERROR"  # see above
    else:
        # change to existing Equiv annotation. Other than the no-op
        # case, this remains TODO.
        assert projectconf.is_equiv_type(
            old_type
        ), 'attempting to change equiv relation to non-equiv relation, operation not supported'

        # sanity
        assert old_target is not None, '_create_equiv: incoherent args: old_type is not None, old_target is None (client/protocol error?)'

        if old_type != type:
            Messager.warning(
                '_create_equiv: equiv type change not supported yet, please tell the devs if you need this feature (mention "issue #798").'
            )

        if old_target != target.id:
            Messager.warning(
                '_create_equiv: equiv reselect not supported yet, please tell the devs if you need this feature (mention "issue #797").'
            )

        # TODO: attributes
        assert attributes is None, "INTERNAL ERROR"  # see above

    return ann
def _create_equiv(ann_obj, projectconf, mods, origin, target, type,
                  attributes, old_type, old_target):
    """Create or (minimally) modify an Equiv annotation between origin
    and target; returns the new annotation, or None when modifying.
    """
    # due to legacy representation choices for Equivs (i.e. no unique
    # ID), attribute support for Equivs would need some extra work;
    # handle the easy attribute-free case only.
    if attributes is not None:
        Messager.warning(
            '_create_equiv: attributes for Equiv annotation not supported yet, please tell the devs if you need this feature (mention "issue #799").')
        attributes = None

    ann = None

    if old_type is None:
        # brand-new annotation; sanity-check argument coherence first
        assert old_target is None, '_create_equiv: incoherent args: old_type is None, old_target is not None (client/protocol error?)'

        ann = EquivAnnotation(type, [str(origin.id), str(target.id)], '')
        ann_obj.add_annotation(ann)
        mods.addition(ann)

        # TODO: attributes
        assert attributes is None, "INTERNAL ERROR"  # see above
        return ann

    # change to existing Equiv annotation. Other than the no-op case,
    # this remains TODO.
    assert projectconf.is_equiv_type(
        old_type), 'attempting to change equiv relation to non-equiv relation, operation not supported'
    # sanity
    assert old_target is not None, '_create_equiv: incoherent args: old_type is not None, old_target is None (client/protocol error?)'

    if old_type != type:
        Messager.warning(
            '_create_equiv: equiv type change not supported yet, please tell the devs if you need this feature (mention "issue #798").')
    if old_target != target.id:
        Messager.warning(
            '_create_equiv: equiv reselect not supported yet, please tell the devs if you need this feature (mention "issue #797").')

    # TODO: attributes
    assert attributes is None, "INTERNAL ERROR"  # see above
    return ann
def __parse_configs(configstr, source, expected_sections):
    """Parse a project configuration string into a dict mapping section
    name to a parsed term hierarchy.

    The top-level structure is a set of term hierarchies separated by
    "[SECTION]" header lines; content before any header falls under
    "general".
    """
    # start by splitting config file lines by section
    section = "general"
    section_lines = {section: []}
    for l in configstr.split("\n"):
        header = re.match(r'^\s*\[(.*)\]\s*$', l)
        if header is None:
            section_lines[section].append(l)
            continue
        section = header.group(1)
        if section not in expected_sections:
            Messager.warning(
                "Project configuration: unexpected section [%s] in %s. Ignoring contents." % (section, source), 5)
        if section not in section_lines:
            section_lines[section] = []

    # attempt to parse lines in each section as a term hierarchy
    configs = {}
    for s, sl in section_lines.items():
        try:
            configs[s] = __read_term_hierarchy(sl)
        except:
            Messager.warning(
                "Project configuration: error parsing section [%s] in %s." % (s, source), 5)
            raise

    # verify that expected sections are present; replace with empty if not.
    for s in expected_sections:
        if s not in configs:
            Messager.warning(
                "Project configuration: missing section [%s] in %s. Configuration may be wrong." % (s, source), 5)
            configs[s] = []

    return configs
def __parse_configs(configstr, source, expected_sections):
    """Split a configuration string into "[SECTION]" groups and parse
    each group as a term hierarchy; returns {section: hierarchy}.
    """
    section_header = re.compile(r'^\s*\[(.*)\]\s*$')

    # bucket lines by the section they fall under; anything before the
    # first header belongs to "general".
    current = "general"
    lines_by_section = {current: []}
    for line in configstr.split("\n"):
        m = section_header.match(line)
        if m:
            current = m.group(1)
            if current not in expected_sections:
                Messager.warning("Project configuration: unexpected section [%s] in %s. Ignoring contents." % (current, source), 5)
            if current not in lines_by_section:
                lines_by_section[current] = []
        else:
            lines_by_section[current].append(line)

    # parse each section's lines as a term hierarchy
    configs = {}
    for name, lines in lines_by_section.items():
        try:
            configs[name] = __read_term_hierarchy(lines)
        except:
            Messager.warning("Project configuration: error parsing section [%s] in %s." % (name, source), 5)
            raise

    # expected sections that never appeared are replaced with empty ones.
    for name in expected_sections:
        if name not in configs:
            Messager.warning("Project configuration: missing section [%s] in %s. Configuration may be wrong." % (name, source), 5)
            configs[name] = []

    return configs
and not f.startswith('.'))) # The configuration is newer than the cache or getmtime(get_config_path(directory)) > cache_mtime): generate = True docstats = [] else: generate = False try: with open( cache_file_path.decode('utf-8').encode('utf-8'), 'rb') as cache_file: docstats = pickle_load(cache_file) except UnpicklingError: # Corrupt data, re-generate Messager.warning( 'Stats cache %s was corrupted; regenerating' % cache_file_path, -1) generate = True except EOFError: # Corrupt data, re-generate generate = True except OSError, e: Messager.warning( 'Failed checking file modification times for stats cache check; regenerating' ) generate = True # "header" and types stat_types = [("Entities", "int"), ("Relations", "int"), ("Events", "int")] if options_get_validation(directory) != 'none':