Пример #1
0
def get_base_types(directory):
    """Collect the type configuration of the project in `directory`.

    Returns a 4-tuple ``(event_types, entity_types, relation_types,
    unconf_types)``. The last element is the visual configuration for
    labels and drawing types that have no annotation configuration.
    """
    project_conf = ProjectConfiguration(directory)

    # Invert the keyboard-shortcut mapping so it can be looked up by value.
    # items() (not iteritems()) keeps this working on Python 2 and 3.
    keymap = project_conf.get_kb_shortcuts()
    hotkey_by_type = dict((v, k) for k, v in keymap.items())

    # fill config for nodes for which annotation is configured

    # calculate once only (this can get heavy)
    all_connections = project_conf.all_connections()

    event_hierarchy = project_conf.get_event_type_hierarchy()
    event_types = _fill_type_configuration(event_hierarchy, project_conf,
                                           hotkey_by_type, all_connections)

    entity_hierarchy = project_conf.get_entity_type_hierarchy()
    entity_types = _fill_type_configuration(entity_hierarchy, project_conf,
                                            hotkey_by_type, all_connections)

    relation_hierarchy = project_conf.get_relation_type_hierarchy()
    relation_types = _fill_relation_configuration(relation_hierarchy,
                                                  project_conf, hotkey_by_type)

    # make visual config available also for nodes for which there is
    # no annotation config. Note that defaults (SPAN_DEFAULT etc.)
    # are included via get_drawing_types() if defined.
    # Both operands are turned into lists explicitly: on Python 3,
    # dict.keys() is a view and cannot be concatenated with `+`.
    candidates = (list(project_conf.get_labels().keys()) +
                  list(project_conf.get_drawing_types()))
    unconfigured = [
        label for label in candidates
        if not project_conf.is_configured_type(label)
    ]
    unconf_types = _fill_visual_configuration(unconfigured, project_conf)

    return event_types, entity_types, relation_types, unconf_types
Пример #2
0
def allowed_to_read(real_path):
    """Return whether the session user may read `real_path`.

    The decision is delegated to the access-control object obtained from
    the project configuration of the directory containing `real_path`.
    When no such object exists, reading is allowed.
    """
    data_path = path_join('/', relpath(real_path, DATA_DIR))
    if isdir(real_path):
        # directories need a trailing slash to comply with robots.txt
        data_path = data_path + '/'

    access_rules = ProjectConfiguration(
        dirname(real_path)).get_access_control()
    if access_rules is None:
        # nothing configured: allow by default
        return True

    # directory read permission
    try:
        user = get_session().get('user')
        if user is None:
            # session exists but carries no user: report "not logged in"
            # and continue with the placeholder user name
            Messager.error('没有登录!', duration=3)
            user = '******'
    except KeyError:
        # report "not logged in" and deny access
        Messager.error('没有登录!', duration=3)
        return False

    return access_rules.can_fetch(user, data_path)
def reverse_arc(collection, document, origin, target, type, attributes=None):
    """Swap the two arguments of a configured binary relation.

    Looks for the first relation in `document` whose arg1 is `origin`,
    arg2 is `target` and whose type is `type`, and exchanges its
    arguments. Equiv arcs and non-relation types are refused with a
    warning. `attributes` is accepted but not used.

    Returns a dict with the document annotations under 'annotations'.
    Raises AnnotationsIsReadOnlyError if the annotations are read-only.
    """
    # TODO: undo response and modification tracking
    real_dir = real_directory(collection)
    projectconf = ProjectConfiguration(real_dir)
    doc_path = path_join(real_dir, document)

    with TextAnnotations(doc_path) as ann_obj:
        # bail as quick as possible if read-only
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())

        if projectconf.is_equiv_type(type):
            Messager.warning('Cannot reverse Equiv arc')
        elif not projectconf.is_relation_type(type):
            Messager.warning('Can only reverse configured binary relations')
        else:
            # OK to reverse
            # TODO: more sensible lookup
            match = next(
                (rel for rel in ann_obj.get_relations()
                 if (rel.arg1 == origin and rel.arg2 == target and
                     rel.type == type)),
                None)
            if match is not None:
                # found it; just adjust this
                match.arg1, match.arg2 = match.arg2, match.arg1
                # TODO: modification tracker
            else:
                Messager.error('reverse_arc: failed to identify target relation (from %s to %s, type %s) (deleted?)' % (str(origin), str(target), str(type)))

        return {'annotations': _json_from_ann(ann_obj)}
Пример #4
0
def possible_arc_types(collection, origin_type, target_type):
    """Build the response describing arc types from one type to another.

    Returns a dict. It is empty when the configuration lookup yields
    None (an error message is issued), carries 'html', 'keymap' and
    'empty' when no arc type is possible, and 'keymap' and 'html'
    otherwise. Any exception is reported through Messager and re-raised.
    """
    projectconf = ProjectConfiguration(real_directory(collection))
    response = {}

    try:
        arc_types = projectconf.arc_types_from_to(origin_type, target_type)

        # TODO: proper error handling
        if arc_types is None:
            Messager.error('Error selecting arc types!', -1)
        elif arc_types == []:
            # nothing to select
            response.update({
                'html': generate_empty_fieldset(),
                'keymap': {},
                'empty': True,
            })
        else:
            # XXX TODO: intentionally breaking this; KB shortcuts
            # should no longer be sent here. Remove 'keymap' and
            # 'html' args once clientside generation done.
            shortcuts = {}  # select_keyboard_shortcuts(possible)

            response['keymap'] = dict(
                (key, "arc_" + name) for key, name in shortcuts.items())
            response['html'] = generate_arc_type_html(
                projectconf, arc_types, shortcuts)
    except:
        # report, then let the original exception propagate
        Messager.error('Error selecting arc types!', -1)
        raise

    return response
Пример #5
0
def possible_arc_types(collection, origin_type, target_type):
    """Build the response describing arc types from one type to another.

    Returns a dict. It is empty when the configuration lookup yields
    None (an error message is issued), carries 'html', 'keymap' and
    'empty' when no arc type is possible, and 'keymap' and 'html'
    otherwise. Any exception is reported through Messager and re-raised.
    """
    projectconf = ProjectConfiguration(real_directory(collection))
    response = {}

    try:
        arc_types = projectconf.arc_types_from_to(origin_type, target_type)

        # TODO: proper error handling
        if arc_types is None:
            Messager.error('Error selecting arc types!', -1)
        elif arc_types == []:
            # nothing to select
            response.update({
                'html': generate_empty_fieldset(),
                'keymap': {},
                'empty': True,
            })
        else:
            # XXX TODO: intentionally breaking this; KB shortcuts
            # should no longer be sent here. Remove 'keymap' and
            # 'html' args once clientside generation done.
            shortcuts = {}  # select_keyboard_shortcuts(possible)

            response['keymap'] = dict(
                (key, "arc_" + name) for key, name in shortcuts.items())
            response['html'] = generate_arc_type_html(
                projectconf, arc_types, shortcuts)
    except:
        # report, then let the original exception propagate
        Messager.error('Error selecting arc types!', -1)
        raise

    return response
Пример #6
0
def tag(collection, document, tagger):
    # POST the text of `document` to the tagger service registered under
    # the token `tagger` and read the response body.
    # Raises UnknownTaggerError when no configured tagger matches,
    # InvalidConnectionSchemeError for URL schemes other than http/https,
    # and TaggerConnectionError on socket errors or a non-200 status.
    # NOTE: this excerpt stops at the `finally:` clause; the rest of the
    # function is not visible here.
    pconf = ProjectConfiguration(real_directory(collection))
    # for/else: the else branch runs only if the loop never hit `break`,
    # i.e. no annotator configuration entry matched `tagger`.
    for tagger_token, _, _, tagger_service_url in pconf.get_annotator_config():
        if tagger == tagger_token:
            break
    else:
        raise UnknownTaggerError(tagger)

    # NOTE(review): doc_path is not used in the visible part of this
    # function; the same path is rebuilt in the `with` statement below.
    doc_path = path_join(real_directory(collection), document)

    with TextAnnotations(path_join(real_directory(collection),
                                   document)) as ann_obj:

        url_soup = urlparse(tagger_service_url)

        # Pick the connection class from the URL scheme.
        if url_soup.scheme == 'http':
            Connection = HTTPConnection
        elif url_soup.scheme == 'https':
            # Delayed HTTPS import since it relies on SSL which is commonly
            #   missing if you roll your own Python, for once we should not
            #   fail early since tagging is currently an edge case and we
            #   can't allow it to bring down the whole server.
            from httplib import HTTPSConnection
            Connection = HTTPSConnection
        else:
            raise InvalidConnectionSchemeError(tagger_token, url_soup.scheme)

        # Initialised before the try so the name is bound even when the
        # Connection constructor raises.
        conn = None
        try:
            conn = Connection(url_soup.netloc)
            req_headers = {
                'Content-type': 'text/plain; charset=utf-8',
                'Accept': 'application/json',
            }
            # Build a new service URL since the request method doesn't accept
            #   a parameters argument
            # NOTE(review): service_url is built here but the request below
            #   sends url_soup.path, so any query string appears to be
            #   dropped -- confirm whether str(service_url) was intended.
            service_url = url_soup.path + ('?' + url_soup.query
                                           if url_soup.query else '')
            try:
                # The request body is the document text encoded as UTF-8;
                # Content-length is the length of the encoded data.
                data = ann_obj.get_document_text().encode('utf-8')
                req_headers['Content-length'] = len(data)
                # Note: Trout slapping for anyone sending Unicode objects here
                conn.request(
                    'POST',
                    # As per: http://bugs.python.org/issue11898
                    # Force the url to be an ascii string
                    str(url_soup.path),
                    data,
                    headers=req_headers)
            except SocketError, e:
                raise TaggerConnectionError(tagger_token, e)
            resp = conn.getresponse()

            # Did the request succeed?
            if resp.status != 200:
                raise TaggerConnectionError(
                    tagger_token, '%s %s' % (resp.status, resp.reason))
            # Finally, we can read the response data
            resp_data = resp.read()
        finally:
Пример #7
0
def tag(collection, document, tagger):
    # POST the text of `document` to the tagger service registered under
    # the token `tagger` and read the response body.
    # Raises UnknownTaggerError when no configured tagger matches,
    # InvalidConnectionSchemeError for URL schemes other than http/https,
    # and TaggerConnectionError on socket errors or a non-200 status.
    # NOTE: this excerpt stops at the `finally:` clause; the rest of the
    # function is not visible here.
    pconf = ProjectConfiguration(real_directory(collection))
    # for/else: the else branch runs only if the loop never hit `break`,
    # i.e. no annotator configuration entry matched `tagger`.
    for tagger_token, _, _, tagger_service_url in pconf.get_annotator_config():
        if tagger == tagger_token:
            break
    else:
        raise UnknownTaggerError(tagger)

    # NOTE(review): doc_path is not used in the visible part of this
    # function; the same path is rebuilt in the `with` statement below.
    doc_path = path_join(real_directory(collection), document)

    with TextAnnotations(path_join(real_directory(collection),
            document)) as ann_obj:

        url_soup = urlparse(tagger_service_url)

        # Pick the connection class from the URL scheme.
        if url_soup.scheme == 'http':
            Connection = HTTPConnection
        elif url_soup.scheme == 'https':
            # Delayed HTTPS import since it relies on SSL which is commonly
            #   missing if you roll your own Python, for once we should not
            #   fail early since tagging is currently an edge case and we
            #   can't allow it to bring down the whole server.
            from httplib import HTTPSConnection
            Connection = HTTPSConnection
        else:
            raise InvalidConnectionSchemeError(tagger_token, url_soup.scheme)

        # Initialised before the try so the name is bound even when the
        # Connection constructor raises.
        conn = None
        try:
            conn = Connection(url_soup.netloc)
            req_headers = {
                    'Content-type': 'text/plain; charset=utf-8',
                    'Accept': 'application/json',
                    }
            # Build a new service URL since the request method doesn't accept
            #   a parameters argument
            # NOTE(review): service_url is built here but the request below
            #   sends url_soup.path, so any query string appears to be
            #   dropped -- confirm whether str(service_url) was intended.
            service_url = url_soup.path + (
                    '?' + url_soup.query if url_soup.query else '')
            try:
                # The request body is the document text encoded as UTF-8;
                # Content-length is the length of the encoded data.
                data = ann_obj.get_document_text().encode('utf-8')
                req_headers['Content-length'] = len(data)
                # Note: Trout slapping for anyone sending Unicode objects here
                conn.request('POST',
                        # As per: http://bugs.python.org/issue11898
                        # Force the url to be an ascii string
                        str(url_soup.path),
                        data,
                        headers=req_headers)
            except SocketError, e:
                raise TaggerConnectionError(tagger_token, e)
            resp = conn.getresponse()

            # Did the request succeed?
            if resp.status != 200:
                raise TaggerConnectionError(tagger_token,
                        '%s %s' % (resp.status, resp.reason))
            # Finally, we can read the response data
            resp_data = resp.read()
        finally:
Пример #8
0
def get_span_types(directory):
    """Collect the span-related type configuration for `directory`.

    Returns a 5-tuple ``(event_types, entity_types, attribute_types,
    relation_types, unconf_types)``. The last element is the visual
    configuration for labels without annotation configuration plus the
    visual span and arc defaults.
    """
    project_conf = ProjectConfiguration(directory)

    # Invert the keyboard-shortcut mapping so it can be looked up by value.
    # items() (not iteritems()) keeps this working on Python 2 and 3.
    keymap = project_conf.get_kb_shortcuts()
    hotkey_by_type = dict((v, k) for k, v in keymap.items())

    # fill config for nodes for which annotation is configured

    event_hierarchy = project_conf.get_event_type_hierarchy()
    event_types = _fill_type_configuration(event_hierarchy, project_conf,
                                           hotkey_by_type)

    entity_hierarchy = project_conf.get_entity_type_hierarchy()
    entity_types = _fill_type_configuration(entity_hierarchy, project_conf,
                                            hotkey_by_type)

    attribute_hierarchy = project_conf.get_attribute_type_hierarchy()
    attribute_types = _fill_attribute_configuration(attribute_hierarchy,
                                                    project_conf)

    relation_hierarchy = project_conf.get_relation_type_hierarchy()
    relation_types = _fill_relation_configuration(relation_hierarchy,
                                                  project_conf, hotkey_by_type)

    # make visual config available also for nodes for which there is
    # no annotation config ...
    unconfigured = [
        label for label in project_conf.get_labels()
        if not project_conf.is_configured_type(label)
    ]
    # ... and include the defaults.
    unconfigured += [VISUAL_SPAN_DEFAULT, VISUAL_ARC_DEFAULT]
    unconf_types = _fill_visual_configuration(unconfigured, project_conf)

    return (event_types, entity_types, attribute_types, relation_types,
            unconf_types)
Пример #9
0
def get_base_types(directory):
    """Collect the type configuration of the project in `directory`.

    Returns a 4-tuple ``(event_types, entity_types, relation_types,
    unconf_types)``. The last element is the visual configuration for
    labels and drawing types that have no annotation configuration.
    """
    project_conf = ProjectConfiguration(directory)

    # Invert the keyboard-shortcut mapping so it can be looked up by value.
    # items() (not iteritems()) keeps this working on Python 2 and 3.
    keymap = project_conf.get_kb_shortcuts()
    hotkey_by_type = dict((v, k) for k, v in keymap.items())

    # fill config for nodes for which annotation is configured

    # calculate once only (this can get heavy)
    all_connections = project_conf.all_connections()

    event_hierarchy = project_conf.get_event_type_hierarchy()
    event_types = _fill_type_configuration(event_hierarchy,
            project_conf, hotkey_by_type, all_connections)

    entity_hierarchy = project_conf.get_entity_type_hierarchy()
    entity_types = _fill_type_configuration(entity_hierarchy,
            project_conf, hotkey_by_type, all_connections)

    relation_hierarchy = project_conf.get_relation_type_hierarchy()
    relation_types = _fill_relation_configuration(relation_hierarchy,
            project_conf, hotkey_by_type)

    # make visual config available also for nodes for which there is
    # no annotation config. Note that defaults (SPAN_DEFAULT etc.)
    # are included via get_drawing_types() if defined.
    # Both operands are turned into lists explicitly: on Python 3,
    # dict.keys() is a view and cannot be concatenated with `+`.
    candidates = (list(project_conf.get_labels().keys()) +
                  list(project_conf.get_drawing_types()))
    unconfigured = [label for label in candidates
                    if not project_conf.is_configured_type(label)]
    unconf_types = _fill_visual_configuration(unconfigured, project_conf)

    return event_types, entity_types, relation_types, unconf_types
Пример #10
0
def get_attribute_types(directory):
    """Return the attribute type configuration for `directory`.

    The result is a 3-tuple ``(entity_attribute_types,
    relation_attribute_types, event_attribute_types)``, each produced by
    _fill_attribute_configuration from the matching hierarchy.
    """
    project_conf = ProjectConfiguration(directory)

    # One hierarchy getter per category, in the order of the result tuple.
    hierarchy_getters = (
        project_conf.get_entity_attribute_type_hierarchy,
        project_conf.get_relation_attribute_type_hierarchy,
        project_conf.get_event_attribute_type_hierarchy,
    )

    return tuple(
        _fill_attribute_configuration(get_hierarchy(), project_conf)
        for get_hierarchy in hierarchy_getters
    )
Пример #11
0
def get_attribute_types(directory):
    """Return the attribute type configuration for `directory`.

    The result is a 3-tuple ``(entity_attribute_types,
    relation_attribute_types, event_attribute_types)``, each produced by
    _fill_attribute_configuration from the matching hierarchy.
    """
    project_conf = ProjectConfiguration(directory)

    # One hierarchy getter per category, in the order of the result tuple.
    hierarchy_getters = (
        project_conf.get_entity_attribute_type_hierarchy,
        project_conf.get_relation_attribute_type_hierarchy,
        project_conf.get_event_attribute_type_hierarchy,
    )

    return tuple(
        _fill_attribute_configuration(get_hierarchy(), project_conf)
        for get_hierarchy in hierarchy_getters
    )
Пример #12
0
def get_span_types(directory):
    """Collect the span-related type configuration for `directory`.

    Returns a 5-tuple ``(event_types, entity_types, attribute_types,
    relation_types, unconf_types)``. The last element is the visual
    configuration for labels without annotation configuration plus the
    visual span and arc defaults.
    """
    project_conf = ProjectConfiguration(directory)

    # Invert the keyboard-shortcut mapping so it can be looked up by value.
    # items() (not iteritems()) keeps this working on Python 2 and 3.
    keymap = project_conf.get_kb_shortcuts()
    hotkey_by_type = dict((v, k) for k, v in keymap.items())

    # fill config for nodes for which annotation is configured

    event_hierarchy = project_conf.get_event_type_hierarchy()
    event_types = _fill_type_configuration(event_hierarchy,
            project_conf, hotkey_by_type)

    entity_hierarchy = project_conf.get_entity_type_hierarchy()
    entity_types = _fill_type_configuration(entity_hierarchy,
            project_conf, hotkey_by_type)

    attribute_hierarchy = project_conf.get_attribute_type_hierarchy()
    attribute_types = _fill_attribute_configuration(attribute_hierarchy,
            project_conf)

    relation_hierarchy = project_conf.get_relation_type_hierarchy()
    relation_types = _fill_relation_configuration(relation_hierarchy,
            project_conf, hotkey_by_type)

    # make visual config available also for nodes for which there is
    # no annotation config ...
    unconfigured = [label for label in project_conf.get_labels()
                    if not project_conf.is_configured_type(label)]
    # ... and include the defaults.
    unconfigured += [VISUAL_SPAN_DEFAULT, VISUAL_ARC_DEFAULT]
    unconf_types = _fill_visual_configuration(unconfigured, project_conf)

    return (event_types, entity_types, attribute_types, relation_types,
            unconf_types)
Пример #13
0
def suggest_span_types(collection, document, start, end, text, model):
    """Query the disambiguation service `model` for type suggestions.

    Predictions for `text` are taken in the order the service returns
    them until their accumulated confidence reaches CUT_OFF. The
    selection is logged and returned under 'types', together with the
    request arguments echoed back.

    Raises SimSemConnectionNotConfiguredError when `model` is not
    configured and SimSemConnectionError when the URL cannot be opened.
    """
    pconf = ProjectConfiguration(real_directory(collection))
    for _, _, configured_model, model_url in pconf.get_disambiguator_config():
        if configured_model == model:
            break
    else:
        # We were unable to find a matching model
        raise SimSemConnectionNotConfiguredError

    try:
        resp = urlopen(model_url % quote_plus(text), None, QUERY_TIMEOUT)
    except URLError:
        # TODO: Could give more details
        raise SimSemConnectionError

    payload = loads(resp.read())
    # predictions are looked up under the decoded form of the text
    preds = payload['result'][text.decode('utf-8')]

    selected_preds = []
    accumulated_conf = 0
    for category, confidence in preds:
        selected_preds.append((category, confidence))
        accumulated_conf += confidence
        if accumulated_conf >= CUT_OFF:
            break

    log_annotation(collection, document, 'DONE', 'suggestion',
                   [None, None, text, selected_preds])

    # array so that server can control presentation order in UI
    # independently from scores if needed
    suggestion = {'types': selected_preds}
    suggestion['collection'] = collection  # echo for reference
    suggestion['document'] = document
    suggestion['start'] = start
    suggestion['end'] = end
    suggestion['text'] = text
    return suggestion
Пример #14
0
def tag(collection, document, tagger):
    # POST the text of `document` to the tagger service registered under
    # the token `tagger` and fetch the response.
    # Raises UnknownTaggerError when no configured tagger matches,
    # InvalidConnectionSchemeError for URL schemes other than http/https,
    # and TaggerConnectionError on socket errors or a non-200 status.
    # NOTE: this excerpt stops at the `finally:` clause; the rest of the
    # function is not visible here.
    pconf = ProjectConfiguration(real_directory(collection))
    # for/else: the else branch runs only if the loop never hit `break`,
    # i.e. no annotator configuration entry matched `tagger`.
    for tagger_token, _, _, tagger_service_url in pconf.get_annotator_config():
        if tagger == tagger_token:
            break
    else:
        raise UnknownTaggerError(tagger)

    # NOTE(review): doc_path is not used in the visible part of this
    # function; the same path is rebuilt in the `with` statement below.
    doc_path = path_join(real_directory(collection), document)

    with TextAnnotations(path_join(real_directory(collection),
                                   document)) as ann_obj:

        url_soup = urlparse(tagger_service_url)

        # Pick the connection class from the URL scheme.
        if url_soup.scheme == 'http':
            Connection = HTTPConnection
        elif url_soup.scheme == 'https':
            Connection = HTTPSConnection
        else:
            raise InvalidConnectionSchemeError(tagger_token, url_soup.scheme)

        # Initialised before the try so the name is bound even when the
        # Connection constructor raises.
        conn = None
        try:
            conn = Connection(url_soup.netloc)
            req_headers = {
                'Content-type': 'text/plain; charset=utf-8',
                'Accept': 'application/json',
            }
            # Build a new service URL since the request method doesn't accept
            #   a parameters argument
            # NOTE(review): service_url is built here but the request below
            #   sends url_soup.path, so any query string appears to be
            #   dropped -- confirm whether service_url was intended.
            service_url = url_soup.path + ('?' + url_soup.query
                                           if url_soup.query else '')
            try:
                conn.request(
                    'POST',
                    url_soup.path,
                    # The document text as body
                    ann_obj.get_document_text().encode('utf8'),
                    headers=req_headers)
            except SocketError, e:
                raise TaggerConnectionError(tagger_token, e)
            resp = conn.getresponse()

            # Did the request succeed?
            if resp.status != 200:
                raise TaggerConnectionError(
                    tagger_token, '%s %s' % (resp.status, resp.reason))
        finally:
def main(argv=None):
    """Verify the annotation files named on the command line.

    `argv` defaults to ``sys.argv``; its first element is skipped. For
    each file, the issues reported by verify_annotation are printed to
    stdout, one per line, prefixed with the file name. Files that cannot
    be found or read as annotations are reported on stderr and skipped.
    """
    import sys
    import os

    if argv is None:
        argv = sys.argv
    arg = argparser().parse_args(argv[1:])

    # remove ".a2" or ".rel" suffixes for Annotations to prompt
    # parsing of .a1 also.
    # (TODO: temporarily removing .ann also to work around a
    # bug in TextAnnotations, but this should not be necessary.)
    strip_suffixes = (".a2", ".rel", ".ann")

    for fn in arg.files:
        try:
            projectconf = ProjectConfiguration(os.path.dirname(fn))
            # Only strip *trailing* suffixes. A plain str.replace() would
            # also remove ".ann" etc. from the middle of the path, e.g.
            # from a directory name that merely contains one of them.
            nosuff_fn = fn
            while nosuff_fn.endswith(strip_suffixes):
                # each suffix holds exactly one dot, at its start, so the
                # last dot marks where the matched suffix begins
                nosuff_fn = nosuff_fn[:nosuff_fn.rfind(".")]
            with annotation.TextAnnotations(nosuff_fn) as ann_obj:
                issues = verify_annotation(ann_obj, projectconf)
                for i in issues:
                    print("%s:\t%s" % (fn, i.human_readable_str()))
        except annotation.AnnotationFileNotFoundError:
            print("%s:\tFailed check: file not found" % fn, file=sys.stderr)
        except annotation.AnnotationNotFoundError as e:
            print("%s:\tFailed check: %s" % (fn, e), file=sys.stderr)

    if arg.verbose:
        print("Check complete.", file=sys.stderr)
Пример #16
0
def get_attribute_types(directory):
    """Return the attribute type configuration for `directory`.

    The result is a 3-tuple ``(entity_attribute_types,
    relation_attribute_types, event_attribute_types)``, each produced by
    _fill_attribute_configuration from the matching hierarchy and the
    inverted keyboard-shortcut mapping.
    """
    project_conf = ProjectConfiguration(directory)

    # Invert the keyboard-shortcut mapping so it can be looked up by value.
    # items() (not iteritems()) keeps this working on Python 2 and 3.
    keymap = project_conf.get_kb_shortcuts()
    hotkey_by_type = dict((v, k) for k, v in keymap.items())

    entity_attribute_hierarchy = project_conf.get_entity_attribute_type_hierarchy()
    entity_attribute_types = _fill_attribute_configuration(
        entity_attribute_hierarchy, project_conf, hotkey_by_type)

    relation_attribute_hierarchy = project_conf.get_relation_attribute_type_hierarchy()
    relation_attribute_types = _fill_attribute_configuration(
        relation_attribute_hierarchy, project_conf, hotkey_by_type)

    event_attribute_hierarchy = project_conf.get_event_attribute_type_hierarchy()
    event_attribute_types = _fill_attribute_configuration(
        event_attribute_hierarchy, project_conf, hotkey_by_type)

    return entity_attribute_types, relation_attribute_types, event_attribute_types
Пример #17
0
def create_comment(collection, document, id, comment=None):
    """Set the comment of annotation `id` in `document`.

    Returns the modification tracker's JSON response extended with the
    document annotations under 'annotations' and, when undo information
    was recorded, its JSON dump under 'undo'.
    Raises AnnotationsIsReadOnlyError if the annotations are read-only.
    """
    undo_resp = {}

    real_dir = real_directory(collection)
    doc_path = path_join(real_dir, document)

    # NOTE(review): neither of the next two values is used below; they are
    # kept because removing them is not obviously free of side effects.
    projectconf = ProjectConfiguration(real_dir)
    txt_file_path = doc_path + '.' + TEXT_FILE_SUFFIX

    with TextAnnotations(doc_path) as ann_obj:
        # bail as quick as possible if read-only
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())

        tracker = ModificationTracker()
        _set_special_comments(ann_obj, id, comment, tracker,
                              undo_resp=undo_resp)

        response = tracker.json_response()
        if undo_resp:
            response['undo'] = json_dumps(undo_resp)
        response['annotations'] = _json_from_ann(ann_obj)
        return response
Пример #18
0
def create_arc(collection, document, origin, target, type, attributes=None,
               old_type=None, old_target=None, comment=None):
    """Create or update an arc from `origin` to `target` in `document`.

    `origin` and `target` are annotation ids. Depending on `type`, the
    arc is handled as an equiv, a relation or an argument. If `old_type`
    is given and belongs to a different category than `type`, the old
    arc is deleted first and the new one is created from scratch.

    Returns the modification tracker's JSON response with the document
    annotations added under 'annotations'.
    Raises AnnotationsIsReadOnlyError if the annotations are read-only.
    """
    undo_resp = {}
    real_dir = real_directory(collection)
    mods = ModificationTracker()
    projectconf = ProjectConfiguration(real_dir)
    doc_path = path_join(real_dir, document)

    with TextAnnotations(doc_path) as ann_obj:
        # bail as quick as possible if read-only
        # TODO: make consistent across the different editing
        # functions, integrate ann_obj initialization and checks
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())

        origin = ann_obj.get_ann_by_id(origin)
        target = ann_obj.get_ann_by_id(target)

        # if there is a previous annotation and the arcs aren't in
        # the same category (e.g. relation vs. event arg), process
        # as delete + create instead of update.
        if old_type is not None:
            category_changed = (
                projectconf.is_relation_type(old_type) !=
                projectconf.is_relation_type(type) or
                projectconf.is_equiv_type(old_type) !=
                projectconf.is_equiv_type(type))
            if category_changed:
                _delete_arc_with_ann(origin.id, old_target, old_type, mods,
                                     ann_obj, projectconf)
                old_target, old_type = None, None

        # pick the creation routine matching the arc category
        if projectconf.is_equiv_type(type):
            create = _create_equiv
        elif projectconf.is_relation_type(type):
            create = _create_relation
        else:
            create = _create_argument
        ann = create(ann_obj, projectconf, mods, origin, target,
                     type, attributes, old_type, old_target)

        # process comments
        if ann is None:
            if comment is not None:
                Messager.warning(
                    'create_arc: non-empty comment for None annotation (unsupported type for comment?)')
        else:
            _set_comments(ann_obj, ann, comment, mods,
                          undo_resp=undo_resp)

        response = mods.json_response()
        response['annotations'] = _json_from_ann(ann_obj)
        return response
Пример #19
0
def tag(collection, document, tagger):
    # POST the text of `document` to the tagger service registered under
    # the token `tagger` and fetch the response.
    # Raises UnknownTaggerError when no configured tagger matches,
    # InvalidConnectionSchemeError for URL schemes other than http/https,
    # and TaggerConnectionError on socket errors or a non-200 status.
    # NOTE: this excerpt stops at the `finally:` clause; the rest of the
    # function is not visible here.
    pconf = ProjectConfiguration(real_directory(collection))
    # for/else: the else branch runs only if the loop never hit `break`,
    # i.e. no annotator configuration entry matched `tagger`.
    for tagger_token, _, _, tagger_service_url in pconf.get_annotator_config():
        if tagger == tagger_token:
            break
    else:
        raise UnknownTaggerError(tagger)

    # NOTE(review): doc_path is not used in the visible part of this
    # function; the same path is rebuilt in the `with` statement below.
    doc_path = path_join(real_directory(collection), document)

    with TextAnnotations(path_join(real_directory(collection),
            document)) as ann_obj:

        url_soup = urlparse(tagger_service_url)

        # Pick the connection class from the URL scheme.
        if url_soup.scheme == 'http':
            Connection = HTTPConnection
        elif url_soup.scheme == 'https':
            Connection = HTTPSConnection
        else:
            raise InvalidConnectionSchemeError(tagger_token, url_soup.scheme)

        # Initialised before the try so the name is bound even when the
        # Connection constructor raises.
        conn = None
        try:
            conn = Connection(url_soup.netloc)
            req_headers = {
                    'Content-type': 'text/plain; charset=utf-8',
                    'Accept': 'application/json',
                    }
            # Build a new service URL since the request method doesn't accept
            #   a parameters argument
            # NOTE(review): service_url is built here but the request below
            #   sends url_soup.path, so any query string appears to be
            #   dropped -- confirm whether service_url was intended.
            service_url = url_soup.path + (
                    '?' + url_soup.query if url_soup.query else '')
            try:
                conn.request('POST', url_soup.path,
                        # The document text as body
                        ann_obj.get_document_text().encode('utf8'),
                        headers=req_headers)
            except SocketError, e:
                raise TaggerConnectionError(tagger_token, e)
            resp = conn.getresponse()

            # Did the request succeed?
            if resp.status != 200:
                raise TaggerConnectionError(tagger_token,
                        '%s %s' % (resp.status, resp.reason))
        finally:
Пример #20
0
def create_arc(collection, document, origin, target, type, attributes=None,
        old_type=None, old_target=None, comment=None):
    """Create or update an arc from `origin` to `target` in `document`.

    `origin` and `target` are annotation ids. Depending on `type`, the
    arc is handled as an equiv, a relation or an argument.

    Returns the modification tracker's JSON response with the document
    annotations added under 'annotations'.
    Raises AnnotationsIsReadOnlyError if the annotations are read-only.
    """
    undo_resp = {}
    real_dir = real_directory(collection)
    mods = ModificationTracker()
    projectconf = ProjectConfiguration(real_dir)
    doc_path = path_join(real_dir, document)

    with TextAnnotations(doc_path) as ann_obj:
        # bail as quick as possible if read-only
        # TODO: make consistent across the different editing
        # functions, integrate ann_obj initialization and checks
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())

        origin = ann_obj.get_ann_by_id(origin)
        target = ann_obj.get_ann_by_id(target)

        # pick the creation routine matching the arc category
        if projectconf.is_equiv_type(type):
            create = _create_equiv
        elif projectconf.is_relation_type(type):
            create = _create_relation
        else:
            create = _create_argument
        ann = create(ann_obj, projectconf, mods, origin, target,
                     type, attributes, old_type, old_target)

        # process comments
        if ann is None:
            if comment is not None:
                Messager.warning('create_arc: non-empty comment for None annotation (unsupported type for comment?)')
        else:
            _set_comments(ann_obj, ann, comment, mods,
                          undo_resp=undo_resp)

        response = mods.json_response()
        response['annotations'] = _json_from_ann(ann_obj)
        return response
Пример #21
0
def tag(collection, document, tagger):
    """Tag `document` with the service registered under `tagger`.

    The document text is URL-quoted and substituted into the configured
    service URL. Every entry of the JSON response is added to the
    document as a text-bound annotation.

    Returns the modification tracker's JSON response with the document
    annotations added under 'annotations'.
    Raises UnknownTaggerError when no configured tagger matches and
    TaggerConnectionError when the service URL cannot be opened.
    """
    real_dir = real_directory(collection)
    pconf = ProjectConfiguration(real_dir)
    # for/else: the else branch runs only when no entry matched `tagger`
    for tagger_token, _, _, tagger_service_url in pconf.get_annotator_config():
        if tagger == tagger_token:
            break
    else:
        raise UnknownTaggerError(tagger)

    doc_path = path_join(real_dir, document)

    with TextAnnotations(doc_path) as ann_obj:

        try:
            # Note: Can we actually fit a whole document in here?
            # Encode before quoting: Python 2's quote_plus raises on
            # non-ASCII unicode input, while UTF-8 bytes are accepted by
            # both Python 2 and 3.
            quoted_doc_text = quote_plus(
                ann_obj.get_document_text().encode('utf-8'))
            resp = urlopen(tagger_service_url % quoted_doc_text, None,
                QUERY_TIMEOUT)
        except URLError:
            raise TaggerConnectionError(tagger_token)

        # TODO: Check for errors
        try:
            json_resp = loads(resp.read())
        finally:
            # release the connection even if reading or parsing fails
            resp.close()

        mods = ModificationTracker()

        # values() (not itervalues()) works on Python 2 and 3
        for ann_data in json_resp.values():
            offsets = ann_data['offsets']
            # Note: We do not support discontinuous spans at this point
            assert len(offsets) == 1, 'discontinuous/null spans'
            start, end = offsets[0]
            _id = ann_obj.get_new_id('T')
            tb = TextBoundAnnotationWithText(
                    start, end,
                    _id,
                    ann_data['type'],
                    ann_data['text']
                    )
            mods.addition(tb)
            ann_obj.add_annotation(tb)

        mod_resp = mods.json_response()
        mod_resp['annotations'] = _json_from_ann(ann_obj)
        return mod_resp
Пример #22
0
def suggest_span_types(collection, document, start, end, text, model):
    """Ask a configured disambiguation service for type suggestions for `text`.

    The service whose model name equals `model` is looked up in the
    project's disambiguator configuration. Its predictions for `text` are
    taken in the order returned until their cumulative confidence reaches
    CUT_OFF.

    Returns a dict carrying the selected (category, confidence) pairs under
    'types' and echoing the request parameters.

    Raises SimSemConnectionNotConfiguredError if no configured model
    matches, and SimSemConnectionError if the query raises URLError.
    """
    project = ProjectConfiguration(real_directory(collection))
    for _, _, candidate, service_url in project.get_disambiguator_config():
        if candidate == model:
            break
    else:
        # No configured model carries the requested name.
        raise SimSemConnectionNotConfiguredError

    try:
        escaped = quote_plus(text)
        reply = urlopen(service_url % escaped, None, QUERY_TIMEOUT)
    except URLError:
        # TODO: Could give more details
        raise SimSemConnectionError

    payload = loads(reply.read())
    ranked = payload['result'][text.decode('utf-8')]

    # Keep predictions until enough confidence mass has been collected.
    selected_preds = []
    accumulated = 0
    for category, confidence in ranked:
        selected_preds.append((category, confidence))
        accumulated += confidence
        if accumulated >= CUT_OFF:
            break

    log_annotation(collection, document, 'DONE', 'suggestion',
                   [None, None, text, selected_preds])

    # 'types' is an array so the server can control the presentation
    # order in the UI independently of the scores if needed.
    response = {
        'types': selected_preds,
        'collection': collection,  # echoed for reference
        'document': document,
        'start': start,
        'end': end,
        'text': text,
    }
    return response
Пример #23
0
def _get_db_path(database, collection):
    """Look up the path and unicode flag of a normalization DB.

    Searches the collection's normalization configuration for an entry
    whose name (entry[0]) equals `database` and returns
    (entry[3], entry[4]) for it.

    Returns (None, Simstring.DEFAULT_UNICODE) when `collection` is None,
    when the database is not configured, or when anything goes wrong
    while reading the configuration (a warning is issued in the latter
    two cases).
    """
    if collection is None:
        # TODO: default to WORK_DIR config?
        return (None, Simstring.DEFAULT_UNICODE)

    # All configuration access happens inside the try so that any
    # failure leads to the warn-and-fall-back path below.
    try:
        conf_dir = real_directory(collection)
        projectconf = ProjectConfiguration(conf_dir)
        norm_conf = projectconf.get_normalization_config()
        for entry in norm_conf:
            # TODO THIS IS WRONG
            dbname, dbpath, dbunicode = entry[0], entry[3], entry[4]
            if dbname == database:
                return (dbpath, dbunicode)
        # not found in config.
        Messager.warning('DB ' + database + ' not defined in config for ' +
                         collection + ', falling back on default.')
    except Exception:
        # whatever goes wrong, just warn and fall back on the default.
        Messager.warning('Failed to get DB path from config for ' +
                         collection + ', falling back on default.')
    return (None, Simstring.DEFAULT_UNICODE)
Пример #24
0
def _get_db_path(database, collection):
    """Return the configured path of normalization DB `database`, or None.

    The collection's normalization configuration is scanned for an entry
    whose name (entry[0]) equals `database`; its path (entry[3]) is
    returned. None is returned when `collection` is None, when no entry
    matches, or when reading the configuration fails; the latter two
    cases also issue a warning.
    """
    # TODO: default to WORK_DIR config?
    if collection is None:
        return None

    try:
        project = ProjectConfiguration(real_directory(collection))
        for entry in project.get_normalization_config():
            name, location = entry[0], entry[3]
            if name == database:
                return location
        # Reached only when no entry matched.
        Messager.warning('DB ' + database + ' not defined in config for ' +
                         collection + ', falling back on default.')
        return None
    except Exception:
        # Best effort: warn and use the default whatever went wrong.
        Messager.warning('Failed to get DB path from config for ' +
                         collection + ', falling back on default.')
        return None
Пример #25
0
def allowed_to_read(real_path):
    """Tell whether the current session user may read `real_path`.

    The path is made relative to DATA_DIR and checked against the access
    control object of the containing directory's project configuration.
    Access is allowed when no access control object is configured.
    """
    data_path = path_join('/', relpath(real_path, DATA_DIR))
    if isdir(real_path):
        # robots.txt-style rules expect directories to end with a slash
        data_path += '/'

    access_rules = ProjectConfiguration(dirname(real_path)).get_access_control()
    if access_rules is None:
        # default allow
        return True

    try:
        user = get_session().get('user')
    except KeyError:
        user = None

    # Fall back to the placeholder user name when nobody is logged in.
    if user is None:
        user = '******'

    return access_rules.can_fetch(user, data_path)
Пример #26
0
def allowed_to_read(real_path):
    """Check read permission for `real_path` for the session's user.

    Returns True if the directory's project configuration has no access
    control object; otherwise returns what that object's `can_fetch`
    reports for the user and the DATA_DIR-relative path.
    """
    relative = relpath(real_path, DATA_DIR)
    data_path = path_join('/', relative)
    # Directories need a trailing slash to comply with robots.txt.
    if isdir(real_path):
        data_path = data_path + '/'

    containing_dir = dirname(real_path)
    parser = ProjectConfiguration(containing_dir).get_access_control()
    if parser is None:
        return True  # nothing configured: allow by default

    try:
        session_user = get_session().get('user')
    except KeyError:
        session_user = None

    user = '******' if session_user is None else session_user
    return parser.can_fetch(user, data_path)
Пример #27
0
def reverse_arc(collection, document, origin, target, type, attributes=None):
    """Swap the two arguments of a binary relation in a document.

    Only configured, non-Equiv relation types can be reversed; for any
    other type a warning is issued and nothing changes. An error message
    is issued if no relation from `origin` to `target` of the given type
    is found. `attributes` is accepted but not used.

    Returns a dict with the document's annotations under 'annotations'.

    Raises AnnotationsIsReadOnlyError if the document is read-only.
    """
    # TODO: undo response and modification tracker are not supported yet
    real_dir = real_directory(collection)
    projectconf = ProjectConfiguration(real_dir)
    doc_path = path_join(real_dir, urllib.parse.unquote(document))

    with TextAnnotations(doc_path) as ann_obj:
        # bail as quick as possible if read-only
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())

        if projectconf.is_equiv_type(type):
            Messager.warning('Cannot reverse Equiv arc')
        elif not projectconf.is_relation_type(type):
            Messager.warning('Can only reverse configured binary relations')
        else:
            # TODO: more sensible lookup
            relation = next(
                (candidate for candidate in ann_obj.get_relations()
                 if candidate.arg1 == origin and candidate.arg2 == target
                 and candidate.type == type),
                None)
            if relation is None:
                Messager.error(
                    'reverse_arc: failed to identify target relation (from %s to %s, type %s) (deleted?)'
                    % (str(origin), str(target), str(type)))
            else:
                # TODO: modification tracker
                relation.arg1, relation.arg2 = relation.arg2, relation.arg1

        return {'annotations': _json_from_ann(ann_obj)}
Пример #28
0
def getAnnObject(collection, document):
    """Load the annotation object for a document, using an on-disk cache.

    A pickled copy under WORK_DIR + "/application/" is used when present;
    otherwise the annotations are read from disk, wrapped in
    SimpleAnnotations and given extra FoLiA info. If validation is enabled
    in the session configuration, the annotations are verified unless
    already marked validated. The resulting object, with its `issues`
    attribute set, is pickled back to the cache and returned.
    """
    try:
        real_dir = real_directory(collection)
    except Exception:
        # Fall back on using the collection as given.
        real_dir = collection
    cache_path = (WORK_DIR + "/application/" +
                  (collection + document).replace("/", ""))
    if os.path.isfile(cache_path):
        # NOTE(review): unpickling is only safe if the cache directory
        # cannot be written by untrusted parties; confirm.
        with open(cache_path, 'rb') as cache_file:
            ann = pickle_load(cache_file)
    else:
        ann = TextAnnotations(real_dir + document)
        ann = SimpleAnnotations(ann)
        try:
            #TODO:good error message
            ann.folia = get_extra_info(collection, document)
        except Exception as e:
            ann.folia = {}
            Messager.error('Error: get extra folia info() failed: %s' % e)
    #Validation:
    try:
        docdir = os.path.dirname(ann._document)
        string = session.load_conf()["config"]
        val = json.loads(string)["validationOn"]
        #validate if config enables it and if it's not already done.
        if val:
            if not ann.validated:
                projectconf = ProjectConfiguration(docdir)
                issues = verify_annotation(ann, projectconf)
            else:
                issues = ann.issues
        else:
            ann.validated = False
            issues = []
    except session.NoSessionError:
        issues = []
    except KeyError:
        issues = []
    except Exception as e:
        # TODO add an issue about the failure?
        issues = []
        Messager.error('Error: validation failed: %s' % e)
    ann.issues = issues
    # The with block closes the cache file even if pickling fails.
    with open(cache_path, 'wb') as cache_file:
        pickle_dump(ann, cache_file)
    return ann
Пример #29
0
def convert(path,doc):
    """Convert a document's annotations and save the result as XML.

    `path` is joined with `doc` to form the extension-less base name of
    the document: `<base>.ann` and `<base>.txt` are read, relations and
    comments are added to the built structure, and it is saved to
    `<base>.xml`.
    """
    # NOTE(review): the configuration object is constructed but not used
    # afterwards; kept in case loading it has effects callers rely on.
    ProjectConfiguration(path)
    base = path_join(path, doc)
    annotations = Annotations(base + ".ann")
    structure = build_text_structure(annotations, base + ".txt")
    add_relations(structure, annotations)
    add_comments(structure, annotations)
    structure.save(base + ".xml")
Пример #30
0
def delete_arc(collection, document, origin, target, type):
    """Delete the arc of `type` from `origin` to `target` in a document.

    Returns the modification tracker's JSON response with the document's
    annotations attached under 'annotations'.

    Raises AnnotationsIsReadOnlyError if the document is read-only.
    """
    real_dir = real_directory(collection)
    tracker = ModificationTracker()
    projectconf = ProjectConfiguration(real_dir)
    doc_path = path_join(real_dir, document)

    with TextAnnotations(doc_path) as ann_obj:
        # Refuse to edit read-only documents before doing anything else.
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())

        _delete_arc_with_ann(origin, target, type, tracker, ann_obj,
                             projectconf)

        response = tracker.json_response()
        response['annotations'] = _json_from_ann(ann_obj)
        return response
Пример #31
0
def get_span_types(directory):
    """Collect type information for the project configured in `directory`.

    Returns a 4-tuple (event_types, entity_types, attribute_types,
    relation_types) built from the corresponding type hierarchies of the
    project configuration.
    """
    project_conf = ProjectConfiguration(directory)

    # Invert the shortcut map so hotkeys can be looked up by type.
    hotkey_by_type = {}
    for hotkey, type_name in project_conf.get_kb_shortcuts().iteritems():
        hotkey_by_type[type_name] = hotkey

    def subtypes_of(hierarchy):
        return _get_subtypes_for_type(hierarchy, project_conf,
                                      hotkey_by_type, directory)

    event_types = subtypes_of(project_conf.get_event_type_hierarchy())
    entity_types = subtypes_of(project_conf.get_entity_type_hierarchy())
    attribute_types = _get_attribute_type_info(
        project_conf.get_attribute_type_hierarchy(), project_conf, directory)
    relation_types = subtypes_of(project_conf.get_relation_type_hierarchy())

    return event_types, entity_types, attribute_types, relation_types
Пример #32
0
def get_annotator_config(directory):
    """Return the annotator configuration of the project in `directory`."""
    # TODO: "annotator" is a very confusing term for a web service
    # that does automatic annotation in the context of a tool
    # where most annotators are expected to be human. Rethink.
    project_conf = ProjectConfiguration(directory)
    return project_conf.get_annotator_config()
Пример #33
0
def _enrich_json_with_data(j_dic, ann_obj):
    """Fill the JSON dictionary `j_dic` with the annotations in `ann_obj`.

    Appends to the existing lists under 'events', 'relations', 'triggers',
    'equivs', 'attributes' and 'normalizations'; the 'entities' and
    'comments' lists are created if missing. Also sets 'mtime' and
    'ctime', reports unparseable lines through Messager, and runs
    annotation verification when the validation option of the document's
    directory is 'all' or 'full'.

    NOTE(review): `issues` is computed at the end but not consumed in the
    portion of the function visible here; the listing may be cut off.
    """
    # TODO: figure out if there's a reason for all the unicode()
    # invocations here; remove if not.

    # We collect trigger ids to be able to link the textbound later on
    trigger_ids = set()
    for event_ann in ann_obj.get_events():
        trigger_ids.add(event_ann.trigger)
        j_dic['events'].append([
            unicode(event_ann.id),
            unicode(event_ann.trigger), event_ann.args
        ])

    for rel_ann in ann_obj.get_relations():
        j_dic['relations'].append([
            unicode(rel_ann.id),
            unicode(rel_ann.type),
            [(rel_ann.arg1l, rel_ann.arg1), (rel_ann.arg2l, rel_ann.arg2)]
        ])

    for tb_ann in ann_obj.get_textbounds():
        j_tb = [unicode(tb_ann.id), tb_ann.type, tb_ann.spans]

        # If we spotted it in the previous pass as a trigger for an
        # event or if the type is known to be an event type, we add it
        # as a json trigger.
        # TODO: proper handling of disconnected triggers. Currently
        # these will be erroneously passed as 'entities'
        if unicode(tb_ann.id) in trigger_ids:
            j_dic['triggers'].append(j_tb)
            # special case for BioNLP ST 2013 format: send triggers
            # also as entities for those triggers that are referenced
            # from annotations other than events (#926).
            if BIONLP_ST_2013_COMPATIBILITY:
                if tb_ann.id in ann_obj.externally_referenced_triggers:
                    j_dic.setdefault('entities', []).append(j_tb)
        else:
            j_dic.setdefault('entities', []).append(j_tb)

    for eq_ann in ann_obj.get_equivs():
        j_dic['equivs'].append(
            ['*', eq_ann.type] + list(eq_ann.entities))

    for att_ann in ann_obj.get_attributes():
        j_dic['attributes'].append([
            unicode(att_ann.id),
            unicode(att_ann.type),
            unicode(att_ann.target), att_ann.value
        ])

    for norm_ann in ann_obj.get_normalizations():
        j_dic['normalizations'].append([
            unicode(norm_ann.id),
            unicode(norm_ann.type),
            unicode(norm_ann.target),
            unicode(norm_ann.refdb),
            unicode(norm_ann.refid),
            unicode(norm_ann.reftext)
        ])

    for com_ann in ann_obj.get_oneline_comments():
        comment = [
            unicode(com_ann.target),
            unicode(com_ann.type),
            com_ann.tail.strip()
        ]
        j_dic.setdefault('comments', []).append(comment)

    if ann_obj.failed_lines:
        error_msg = 'Unable to parse the following line(s):\n%s' % (
            '\n'.join([
                (
                    '%s: %s' % (
                        # The line number is off by one
                        unicode(line_num + 1),
                        unicode(ann_obj[line_num]))).strip()
                for line_num in ann_obj.failed_lines
            ]))
        Messager.error(error_msg, duration=len(ann_obj.failed_lines) * 3)

    j_dic['mtime'] = ann_obj.ann_mtime
    j_dic['ctime'] = ann_obj.ann_ctime

    try:
        # XXX avoid digging the directory from the ann_obj
        import os
        docdir = os.path.dirname(ann_obj._document)
        if options_get_validation(docdir) in (
                'all',
                'full',
        ):
            from verify_annotations import verify_annotation
            projectconf = ProjectConfiguration(docdir)
            issues = verify_annotation(ann_obj, projectconf)
        else:
            issues = []
    except Exception as e:
        # TODO add an issue about the failure?
        issues = []
        Messager.error('Error: verify_annotation() failed: %s' % e, -1)
Пример #34
0
def get_disambiguator_config(directory):
    """Return the disambiguator configuration of the project in `directory`."""
    project_conf = ProjectConfiguration(directory)
    return project_conf.get_disambiguator_config()
Пример #35
0
def get_normalization_config(directory):
    """Return the normalization configuration of the project in `directory`."""
    project_conf = ProjectConfiguration(directory)
    return project_conf.get_normalization_config()
Пример #36
0
def delete_arc(collection, document, origin, target, type):
    """Delete the arc of `type` from `origin` to `target` in a document.

    The origin annotation is first treated as an event: if the
    (type, target) tuple is among its arguments, it is removed. If an
    AttributeError is raised along the way (e.g. the origin has no
    `args`), the arc is handled as an Equiv or as a binary relation,
    depending on the project configuration.

    Raises AnnotationsIsReadOnlyError if the document is read-only.

    NOTE(review): this listing is cut off after the final `else:`; the
    rest of the function, including its normal return, is not visible.
    """
    directory = collection

    real_dir = real_directory(directory)
    document = path_join(real_dir, document)

    # NOTE(review): not used in the visible part of the function.
    txt_file_path = document + '.' + TEXT_FILE_SUFFIX

    with TextAnnotations(document) as ann_obj:
        # bail as quick as possible if read-only
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())

        mods = ModificationTracker()

        # This can be an event or an equiv
        #TODO: Check for None!
        try:
            event_ann = ann_obj.get_ann_by_id(origin)
            # Try if it is an event
            arg_tup = (type, unicode(target))
            if arg_tup in event_ann.args:
                # Record the state before the change for the tracker.
                before = unicode(event_ann)
                event_ann.args.remove(arg_tup)
                mods.change(before, event_ann)

                # The string literal below is disabled legacy code kept
                # as a bare expression; it has no effect at runtime.
                '''
                if not event_ann.args:
                    # It was the last argument tuple, remove it all
                    try:
                        ann_obj.del_annotation(event_ann)
                        mods.deletion(event_ann)
                    except DependingAnnotationDeleteError, e:
                        #XXX: Old message api
                        print 'Content-Type: application/json\n'
                        print dumps(e.json_error_response())
                        return
                '''
            else:
                # What we were to remove did not even exist in the first place
                pass

        except AttributeError:
            # Not an event with arguments: consult the project
            # configuration to decide between Equiv and relation handling.
            projectconf = ProjectConfiguration(real_dir)
            if projectconf.is_equiv_type(type):
                # It is an equiv then?
                #XXX: Slow hack! Should have a better accessor! O(eq_ann)
                for eq_ann in ann_obj.get_equivs():
                    # We don't assume that the ids only occur in one Equiv, we
                    # keep on going since the data "could" be corrupted
                    if (unicode(origin) in eq_ann.entities
                            and unicode(target) in eq_ann.entities):
                        before = unicode(eq_ann)
                        eq_ann.entities.remove(unicode(origin))
                        eq_ann.entities.remove(unicode(target))
                        mods.change(before, eq_ann)

                    # This size check runs for every Equiv visited, not
                    # only for the one that was just modified.
                    if len(eq_ann.entities) < 2:
                        # We need to delete this one
                        try:
                            ann_obj.del_annotation(eq_ann)
                            mods.deletion(eq_ann)
                        except DependingAnnotationDeleteError, e:
                            #TODO: This should never happen, dep on equiv
                            #print 'Content-Type: application/json\n'
                            # TODO: Proper exception here!
                            Messager.error(e.json_error_response())
                            return {}
            elif type in projectconf.get_relation_types():
                # Delete the first relation matching type, origin and target.
                for ann in ann_obj.get_relations():
                    if ann.type == type and ann.arg1 == origin and ann.arg2 == target:
                        ann_obj.del_annotation(ann)
                        mods.deletion(ann)
                        break
            else:
Пример #37
0
def get_search_config(directory):
    """Return the search configuration of the project in `directory`."""
    project_conf = ProjectConfiguration(directory)
    return project_conf.get_search_config()
def create_arc(collection, document, origin, target, type, attributes=None,
        old_type=None, old_target=None, comment=None):
    """Create or update an arc from `origin` to `target` in a document.

    Depending on the project configuration of `type`, the arc is stored
    as an Equiv, a binary relation or an event argument. If `old_type`
    is given and belongs to a different category than `type`, the old arc
    is deleted and a new one created instead of being updated. A comment
    is attached to the resulting annotation when there is one.

    Returns the modification tracker's JSON response with the document's
    annotations attached under 'annotations'.

    Raises SpanOffsetOverlapError if the annotation object reports an
    existing annotation for the origin/target pair, and
    AnnotationsIsReadOnlyError if the document is read-only.
    """
    undo_resp = {}
    real_dir = real_directory(collection)
    mods = ModificationTracker()
    projectconf = ProjectConfiguration(real_dir)
    doc_path = path_join(real_dir, document)

    with TextAnnotations(doc_path) as ann_obj:
        pair_key = 'Arg1:' + str(origin) + '_Arg2:' + str(target)
        if ann_obj.ann_exists_in_offset(pair_key):
            # Report the last whitespace-separated token of each
            # annotation's string form.
            clashing = [
                str(ann_obj.get_ann_by_id(ann_id)).split()[-1]
                for ann_id in (origin, target)
            ]
            raise SpanOffsetOverlapError(clashing)

        # bail as quick as possible if read-only
        # TODO: make consistent across the different editing
        # functions, integrate ann_obj initialization and checks
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())

        origin_ann = ann_obj.get_ann_by_id(origin)
        target_ann = ann_obj.get_ann_by_id(target)

        # If there is a previous annotation and the arcs are not in the
        # same category (e.g. relation vs. event arg), process as
        # delete + create instead of update.
        if old_type is not None:
            category_changed = (
                projectconf.is_relation_type(old_type) !=
                projectconf.is_relation_type(type) or
                projectconf.is_equiv_type(old_type) !=
                projectconf.is_equiv_type(type))
            if category_changed:
                _delete_arc_with_ann(origin_ann.id, old_target, old_type,
                                     mods, ann_obj, projectconf)
                old_target, old_type = None, None

        # Pick the creation routine matching the arc category.
        if projectconf.is_equiv_type(type):
            creator = _create_equiv
        elif projectconf.is_relation_type(type):
            creator = _create_relation
        else:
            creator = _create_argument
        ann = creator(ann_obj, projectconf, mods, origin_ann, target_ann,
                      type, attributes, old_type, old_target)

        # process comments
        if ann is not None:
            _set_comments(ann_obj, ann, comment, mods,
                          undo_resp=undo_resp)
        elif comment is not None:
            Messager.warning('create_arc: non-empty comment for None annotation (unsupported type for comment?)')

        response = mods.json_response()
        response['annotations'] = _json_from_ann(ann_obj)
        return response
Пример #39
0
def delete_arc(collection, document, origin, target, type):
    """Delete the arc of `type` from `origin` to `target` in a document.

    The origin annotation is first treated as an event: if the
    (type, target) tuple is among its arguments, it is removed. If an
    AttributeError is raised along the way (e.g. the origin has no
    `args`), the arc is handled as an Equiv or as a binary relation,
    depending on the project configuration.

    Raises AnnotationsIsReadOnlyError if the document is read-only.

    NOTE(review): this listing is cut off after the final `else:`; the
    rest of the function, including its normal return, is not visible.
    """
    directory = collection

    real_dir = real_directory(directory)
    document = path_join(real_dir, document)

    # NOTE(review): not used in the visible part of the function.
    txt_file_path = document + '.' + TEXT_FILE_SUFFIX

    with TextAnnotations(document) as ann_obj:
        # bail as quick as possible if read-only
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())

        mods = ModificationTracker()

        # This can be an event or an equiv
        #TODO: Check for None!
        try:
            event_ann = ann_obj.get_ann_by_id(origin)
            # Try if it is an event
            arg_tup = (type, unicode(target))
            if arg_tup in event_ann.args:
                # Record the state before the change for the tracker.
                before = unicode(event_ann)
                event_ann.args.remove(arg_tup)
                mods.change(before, event_ann)

                # The string literal below is disabled legacy code kept
                # as a bare expression; it has no effect at runtime.
                '''
                if not event_ann.args:
                    # It was the last argument tuple, remove it all
                    try:
                        ann_obj.del_annotation(event_ann)
                        mods.deletion(event_ann)
                    except DependingAnnotationDeleteError, e:
                        #XXX: Old message api
                        print 'Content-Type: application/json\n'
                        print dumps(e.json_error_response())
                        return
                '''
            else:
                # What we were to remove did not even exist in the first place
                pass

        except AttributeError:
            # Not an event with arguments: consult the project
            # configuration to decide between Equiv and relation handling.
            projectconf = ProjectConfiguration(real_dir)
            if projectconf.is_equiv_type(type):
                # It is an equiv then?
                #XXX: Slow hack! Should have a better accessor! O(eq_ann)
                for eq_ann in ann_obj.get_equivs():
                    # We don't assume that the ids only occur in one Equiv, we
                    # keep on going since the data "could" be corrupted
                    if (unicode(origin) in eq_ann.entities
                            and unicode(target) in eq_ann.entities):
                        before = unicode(eq_ann)
                        eq_ann.entities.remove(unicode(origin))
                        eq_ann.entities.remove(unicode(target))
                        mods.change(before, eq_ann)

                    # This size check runs for every Equiv visited, not
                    # only for the one that was just modified.
                    if len(eq_ann.entities) < 2:
                        # We need to delete this one
                        try:
                            ann_obj.del_annotation(eq_ann)
                            mods.deletion(eq_ann)
                        except DependingAnnotationDeleteError, e:
                            #TODO: This should never happen, dep on equiv
                            #print 'Content-Type: application/json\n'
                            # TODO: Proper exception here!
                            Messager.error(e.json_error_response())
                            return {}
            elif type in projectconf.get_relation_types():
                # Delete the first relation matching type, origin and target.
                for ann in ann_obj.get_relations():
                    if ann.type == type and ann.arg1 == origin and ann.arg2 == target:
                        ann_obj.del_annotation(ann)
                        mods.deletion(ann)
                        break
            else:
Пример #40
0
def create_arc(collection, document, origin, target, type, old_type=None, old_target=None):
    """Create or update an arc from `origin` to `target` in a document.

    An arc of type "Equiv" creates an Equiv annotation, a configured
    relation type creates or updates a binary relation, and anything else
    is handled as an event argument of `origin` (a new event is created
    if `origin` has no arguments attribute). `old_type` / `old_target`
    describe the arc being changed, if any.

    Returns a dict with the document's annotations under "annotations";
    when DEBUG is set it also carries the modification tracker's response.

    Raises AnnotationsIsReadOnlyError if the document is read-only.
    """
    directory = collection

    real_dir = real_directory(directory)
    mods = ModificationTracker()
    projectconf = ProjectConfiguration(real_dir)

    document = path_join(real_dir, document)

    with TextAnnotations(document) as ann_obj:
        # Dirty hack to bail as quick as possible if read-only

        # TODO: why only here? The checking of readonly should be
        # consistent across the different editing functions.
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())

        origin = ann_obj.get_ann_by_id(origin)
        target = ann_obj.get_ann_by_id(target)

        # Ugly check, but we really get no other information
        if type == "Equiv":
            # It is an Equiv
            if old_type == "Equiv":
                # "Change" from Equiv to Equiv is harmless
                # TODO: some message needed?
                pass
            else:
                assert old_type is None, "attempting to change Equiv, not supported"
                ann = EquivAnnotation(type, [unicode(origin.id), unicode(target.id)], "")
                ann_obj.add_annotation(ann)
                mods.addition(ann)
        elif type in projectconf.get_relation_types():
            if old_type is not None or old_target is not None:
                sought_target = old_target if old_target is not None else target.id
                sought_type = old_type if old_type is not None else type

                # We are to change the type and/or target. The relation
                # must start at the origin being edited; otherwise an
                # unrelated relation with the same target and type could
                # be picked up.
                found = None
                for ann in ann_obj.get_relations():
                    if (ann.arg1 == origin.id and ann.arg2 == sought_target
                            and ann.type == sought_type):
                        found = ann
                        break

                # Did it exist and is changed?, otherwise we do nothing
                if found is not None and (found.arg2 != target.id or found.type != type):
                    before = unicode(found)
                    found.arg2 = target.id
                    found.type = type
                    mods.change(before, found)
            else:
                # Create a new annotation

                # TODO: Assign a suitable letter
                new_id = ann_obj.get_new_id("R")
                rel = projectconf.get_relation_by_type(type)
                assert rel is not None and len(rel.arg_list) == 2
                a1l, a2l = rel.arg_list
                ann = BinaryRelationAnnotation(new_id, type, a1l, origin.id, a2l, target.id, "\t")
                mods.addition(ann)
                ann_obj.add_annotation(ann)
        else:
            # Built outside the try so it is always bound when the
            # AttributeError handler below needs it.
            arg_tup = (type, unicode(target.id))
            try:
                # Is this an addition or an update?
                if old_type is None and old_target is None:
                    if arg_tup not in origin.args:
                        before = unicode(origin)
                        origin.args.append(arg_tup)
                        mods.change(before, origin)
                    else:
                        # It already existed as an arg, we were called to do nothing...
                        pass
                else:
                    # Construct how the old arg would have looked like.
                    # Arguments are stored as (type, id) tuples, so use
                    # the target's id rather than the annotation object.
                    old_arg_tup = (
                        type if old_type is None else old_type,
                        unicode(target.id) if old_target is None else old_target,
                    )

                    if old_arg_tup in origin.args and arg_tup not in origin.args:
                        before = unicode(origin)
                        origin.args.remove(old_arg_tup)
                        origin.args.append(arg_tup)
                        mods.change(before, origin)
                    else:
                        # Collision etc. don't do anything
                        pass
            except AttributeError:
                # The annotation did not have args, it was most likely an entity
                # thus we need to create a new Event...
                new_id = ann_obj.get_new_id("E")
                ann = EventAnnotation(origin.id, [arg_tup], new_id, origin.type, "")
                ann_obj.add_annotation(ann)
                mods.addition(ann)

        if DEBUG:
            mods_json = mods.json_response()
        else:
            mods_json = {}

        # Hack since we don't have the actual text, should use a factory?
        txt_file_path = ann_obj.get_document() + "." + TEXT_FILE_SUFFIX
        j_dic = _json_from_ann_and_txt(ann_obj, txt_file_path)

        mods_json["annotations"] = j_dic
        return mods_json
Пример #41
0
def filter_layers(ann, path):
    """Remove annotations belonging to layers disabled in the session.

    The disabled layer names are read from the "layers" entry of the
    session configuration (none are disabled if it cannot be read).
    Entities, triggers, relations and attributes whose field at index 1
    is a disabled layer are dropped from `ann`, together with events on
    dropped triggers and relations/attributes pointing at dropped
    entities; remaining events lose their references to dropped entities.

    `ann` is modified in place and returned.
    """
    # Added by Sander Naert to disable the visualisation of some annotations
    try:
        conf_string = session.load_conf()["config"]
        disabled = json.loads(conf_string)["layers"]
    except (session.NoSessionError, KeyError):
        disabled = []
    except Exception as e:
        disabled = []
        Messager.error("Error while enabling/disabling layers: " + str(e))
    # NOTE(review): the configuration object is constructed but not used
    # afterwards; kept in case loading it has effects callers rely on.
    ProjectConfiguration(path)
    hidden_layers = list(disabled)

    # Entities: drop the hidden ones and remember their ids.
    hidden_entity_ids = set()
    visible_entities = []
    for entity in ann["entities"]:
        if entity[1] in hidden_layers:
            hidden_entity_ids.add(entity[0])
        else:
            visible_entities.append(entity)
    ann["entities"] = visible_entities

    # Triggers: drop the hidden ones and remember their ids.
    hidden_trigger_ids = []
    visible_triggers = []
    for trigger in ann["triggers"]:
        if trigger[1] in hidden_layers:
            hidden_trigger_ids.append(trigger[0])
        else:
            visible_triggers.append(trigger)
    ann["triggers"] = visible_triggers

    # Events: drop those on hidden triggers; prune roles of the rest.
    visible_events = []
    for event in ann["events"]:
        if event[1] in hidden_trigger_ids:
            continue
        event[2][:] = [
            role for role in event[2] if role[1] not in hidden_entity_ids
        ]
        visible_events.append(event)
    ann["events"] = visible_events

    # Relations: drop hidden types and any relation touching a hidden entity.
    visible_relations = []
    for relation in ann["relations"]:
        if relation[1] in hidden_layers:
            continue
        dangling = [
            role for role in relation[2] if role[1] in hidden_entity_ids
        ]
        if not dangling:
            visible_relations.append(relation)
    ann["relations"] = visible_relations

    # Attributes: drop hidden types and those targeting a hidden entity.
    ann["attributes"] = [
        attribute for attribute in ann["attributes"]
        if attribute[1] not in hidden_layers
        and attribute[2] not in hidden_entity_ids
    ]

    return ann
Пример #42
0
def _create_span(collection, document, offsets, _type, attributes=None,
                 normalizations=None, _id=None, comment=None):
    """Create a span annotation, or edit the existing one named by ``_id``.

    Args:
        collection: Collection directory; resolved with ``real_directory``.
        document: Document name, joined onto the resolved directory.
        offsets: Span offsets; rejected if ``_offset_overlaps`` reports an
            overlap.
        _type: Type of the span annotation.
        attributes: Raw attributes, parsed with ``_parse_attributes``.
        normalizations: Raw normalizations, parsed with
            ``_parse_span_normalizations``.
        _id: ID of the annotation to edit, or None to create a new one.
        comment: Comment handed to ``_set_comments``.

    Returns:
        A dict: the modification tracker's JSON response (or an empty dict
        when no text-bound annotation resulted), extended with
        ``'annotations'`` and, when undo information was collected,
        ``'undo'``.

    Raises:
        SpanOffsetOverlapError: If ``_offset_overlaps(offsets)`` is true.
        AnnotationsIsReadOnlyError: If the opened document is read-only.
    """
    if _offset_overlaps(offsets):
        raise SpanOffsetOverlapError(offsets)

    undo_resp = {}

    _attributes = _parse_attributes(attributes)
    _normalizations = _parse_span_normalizations(normalizations)

    real_dir = real_directory(collection)
    document = path_join(real_dir, document)

    projectconf = ProjectConfiguration(real_dir)

    txt_file_path = document + '.' + TEXT_FILE_SUFFIX

    with TextAnnotations(document) as ann_obj:
        # bail as quick as possible if read-only
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())

        mods = ModificationTracker()

        if _id is not None:
            # We are to edit an existing annotation
            tb_ann, e_ann = _edit_span(ann_obj, mods, _id, offsets, projectconf,
                                       _attributes, _type, undo_resp=undo_resp)
        else:
            # We are to create a new annotation
            tb_ann, e_ann = __create_span(
                ann_obj, mods, _type, offsets, txt_file_path, projectconf, _attributes)

            # Undo refers to the event when there is one, otherwise to the
            # text-bound annotation.
            # NOTE(review): if both are None this raises AttributeError
            # before the "rejected" branch below is reached -- confirm
            # whether __create_span can still return (None, None).
            undo_resp['action'] = 'add_tb'
            if e_ann is not None:
                undo_resp['id'] = e_ann.id
            else:
                undo_resp['id'] = tb_ann.id

        # Attributes, normalizations and comments attach to the event when
        # there is one (not to its trigger); otherwise to the text-bound.
        target_ann = e_ann if e_ann is not None else tb_ann

        # Set attributes
        _set_attributes(ann_obj, target_ann, _attributes, mods,
                        undo_resp=undo_resp)

        # Set normalizations
        _set_normalizations(ann_obj, target_ann, _normalizations, mods,
                            undo_resp=undo_resp)

        if tb_ann is not None:
            # Set comments
            _set_comments(ann_obj, target_ann, comment, mods,
                          undo_resp=undo_resp)
            mods_json = mods.json_response()
        else:
            # Hack, probably we had a new-line in the span
            mods_json = {}
            Messager.error(
                'Text span contained new-line, rejected',
                duration=3)

        if undo_resp:
            mods_json['undo'] = json_dumps(undo_resp)
        mods_json['annotations'] = _json_from_ann(ann_obj)
        return mods_json
Пример #43
0
Файл: tag.py Проект: WeSIG/Delta
def tag(collection, document, tagger):
    """Run an external tagger service on a document and store its output.

    The service URL is looked up in the project's annotator configuration by
    matching ``tagger`` against each entry's token. The request body is the
    document path, the literal separator ``#*^$#`` and the document text,
    UTF-8 encoded and sent with POST. Text-bound and normalization entries
    of the JSON response are added to the document's annotations.

    Args:
        collection: Collection directory; resolved with ``real_directory``.
        document: Document name, joined onto the resolved directory.
        tagger: Token identifying the tagger in the annotator configuration.

    Returns:
        The modification tracker's JSON response dict, extended with an
        ``'annotations'`` entry built by ``_json_from_ann``.

    Raises:
        UnknownTaggerError: No configured tagger has the token ``tagger``.
        InvalidConnectionSchemeError: The service URL is neither http nor
            https.
        TaggerConnectionError: The request failed with a socket error or the
            service answered with a status other than 200.
        InvalidTaggerResponseError: The response body is not valid JSON.
        AssertionError: A text-bound entry lacks ``offsets``/``type``/
            ``texts`` or they are inconsistent.
        KeyError: A normalization entry lacks a required key or refers to an
            unknown text-bound entry.
    """
    real_dir = real_directory(collection)
    pconf = ProjectConfiguration(real_dir)
    for tagger_token, _, _, tagger_service_url in pconf.get_annotator_config():
        if tagger == tagger_token:
            break
    else:
        raise UnknownTaggerError(tagger)

    doc_path = path_join(real_dir, document)

    with TextAnnotations(doc_path) as ann_obj:
        url_soup = urlparse(tagger_service_url)

        if url_soup.scheme == 'http':
            Connection = HTTPConnection
        elif url_soup.scheme == 'https':
            # Delayed HTTPS import since it relies on SSL which is commonly
            #   missing if you roll your own Python, for once we should not
            #   fail early since tagging is currently an edge case and we
            #   can't allow it to bring down the whole server.
            from http.client import HTTPSConnection
            Connection = HTTPSConnection
        else:
            raise InvalidConnectionSchemeError(tagger_token, url_soup.scheme)

        conn = None
        try:
            conn = Connection(url_soup.netloc)
            req_headers = {
                'Content-type': 'text/plain; charset=utf-8',
                'Accept': 'application/json',
            }
            # Build a new service URL since the request method doesn't accept
            #   a parameters argument
            service_url = url_soup.path + ('?' + url_soup.query
                                           if url_soup.query else '')
            try:
                # Payload: document path, separator, document text.
                # Encoded once so Content-length counts bytes, not characters.
                data = str(doc_path) + "#*^$#" + ann_obj.get_document_text()
                data = data.encode('utf-8')
                req_headers['Content-length'] = len(data)
                conn.request(
                    'POST',
                    # As per: http://bugs.python.org/issue11898
                    # Force the url to be an ascii string
                    str(service_url),
                    data,
                    headers=req_headers)
            except SocketError as e:
                raise TaggerConnectionError(tagger_token, e)
            resp = conn.getresponse()

            # Did the request succeed?
            if resp.status != 200:
                raise TaggerConnectionError(
                    tagger_token, '%s %s' % (resp.status, resp.reason))
            # Finally, we can read the response data
            resp_data = resp.read()
        finally:
            if conn is not None:
                conn.close()

        try:
            json_resp = loads(resp_data)
        except ValueError:
            raise InvalidTaggerResponseError(tagger_token, resp_data)

        mods = ModificationTracker()
        # Maps the tagger's own entry IDs to the IDs allocated in ann_obj
        cidmap = {}

        for cid, ann in json_resp.items():
            if not _is_textbound(ann):
                continue

            assert 'offsets' in ann, 'Tagger response lacks offsets'
            offsets = ann['offsets']
            assert 'type' in ann, 'Tagger response lacks type'
            _type = ann['type']
            assert 'texts' in ann, 'Tagger response lacks texts'
            texts = ann['texts']

            # sanity
            assert len(offsets) != 0, 'Tagger response has empty offsets'
            assert len(texts) == len(
                offsets
            ), 'Tagger response has different numbers of offsets and texts'

            # Unpacking fails unless the first offset is a two-item sequence
            _start, _end = offsets[0]
            text = texts[0]

            _id = ann_obj.get_new_id('T')
            cidmap[cid] = _id

            tb = TextBoundAnnotationWithText(offsets, _id, _type, text,
                                             " " + ' '.join(texts[1:]))

            mods.addition(tb)
            ann_obj.add_annotation(tb)

        for norm in json_resp.values():
            if not _is_normalization(norm):
                continue

            # TODO: a missing key currently surfaces as a plain KeyError
            _type = norm['type']
            target = norm['target']
            refdb = norm['refdb']
            refid = norm['refid']

            _id = ann_obj.get_new_id('N')
            # Translate the tagger's target ID to the ID we allocated above
            target = cidmap[target]

            na = NormalizationAnnotation(_id, _type, target, refdb, refid, '')

            mods.addition(na)
            ann_obj.add_annotation(na)

        mod_resp = mods.json_response()
        mod_resp['annotations'] = _json_from_ann(ann_obj)
        return mod_resp
Пример #44
0
def get_linker_config(directory):
    """Return the linker configuration reported for ``directory``.

    Builds a ``ProjectConfiguration`` for the directory and hands back
    whatever its ``get_linker_config()`` returns.
    """
    project_conf = ProjectConfiguration(directory)
    linker_config = project_conf.get_linker_config()
    return linker_config
Пример #45
0
def create_arc(collection, document, origin, target, type,
        old_type=None, old_target=None):
    """Create an arc from ``origin`` to ``target``, or change an existing one.

    Depending on what the project configuration says about ``type``:

    * equiv type: a new ``EquivAnnotation`` over both annotations is added
      (an Equiv-to-Equiv "change" is a no-op);
    * relation type: a new ``BinaryRelationAnnotation`` is added or, when
      ``old_type``/``old_target`` are given, the matching relation of
      ``origin`` is retargeted/retyped;
    * anything else: the arc is treated as an event argument of ``origin``;
      if ``origin`` has no arguments (AttributeError) a new
      ``EventAnnotation`` is created with ``origin`` as trigger.

    Arguments:
    collection -- collection directory, resolved with real_directory
    document -- document name, joined onto the resolved directory
    origin -- ID of the annotation the arc starts from
    target -- ID of the annotation the arc points to
    type -- type of the arc
    old_type -- previous type when changing an existing arc, else None
    old_target -- previous target ID when changing an existing arc, else None

    Returns the modification tracker's JSON response dict, extended with an
    'annotations' entry built by _json_from_ann.

    Raises AnnotationsIsReadOnlyError if the document is read-only.
    """
    real_dir = real_directory(collection)
    mods = ModificationTracker()
    projectconf = ProjectConfiguration(real_dir)

    document = path_join(real_dir, document)

    with TextAnnotations(document) as ann_obj:
        # bail as quick as possible if read-only
        # TODO: make consistent across the different editing
        # functions, integrate ann_obj initialization and checks
        if ann_obj._read_only:
            raise AnnotationsIsReadOnlyError(ann_obj.get_document())

        # From here on origin/target are annotation objects, not IDs
        origin = ann_obj.get_ann_by_id(origin)
        target = ann_obj.get_ann_by_id(target)

        if projectconf.is_equiv_type(type):
            # It is an Equiv; "change" from Equiv to Equiv is harmless
            # TODO: some message needed?
            if not projectconf.is_equiv_type(old_type):
                assert old_type is None, 'attempting to change equiv relation to non-equiv relation, operation not supported'
                ann = EquivAnnotation(type, [unicode(origin.id), unicode(target.id)], '')
                ann_obj.add_annotation(ann)
                mods.addition(ann)
        elif projectconf.is_relation_type(type):
            if old_type is not None or old_target is not None:
                # Report the offending arc type (not the target's own type)
                assert type in projectconf.get_relation_types(), (
                        ('attempting to convert relation to non-relation "%s" ' % (type, )) +
                        ('(legit types: %s)' % (unicode(projectconf.get_relation_types()), )))

                sought_target = (old_target
                        if old_target is not None else target.id)
                sought_type = (old_type
                        if old_type is not None else type)

                # We are to change the type and/or target. The origin of an
                # arc cannot change here, so it is part of the match;
                # otherwise a same-typed relation from another origin to the
                # same target could be rewritten by mistake.
                found = None
                for ann in ann_obj.get_relations():
                    if (ann.arg1 == origin.id and ann.arg2 == sought_target
                            and ann.type == sought_type):
                        found = ann
                        break

                # Did it exist and is changed?, otherwise we do nothing
                if found is not None and (found.arg2 != target.id
                        or found.type != type):
                    before = unicode(found)
                    found.arg2 = target.id
                    found.type = type
                    mods.change(before, found)
            else:
                # Create a new annotation

                # TODO: Assign a suitable letter
                new_id = ann_obj.get_new_id('R')
                rel = projectconf.get_relation_by_type(type)
                assert rel is not None and len(rel.arg_list) == 2
                a1l, a2l = rel.arg_list
                ann = BinaryRelationAnnotation(new_id, type, a1l, origin.id, a2l, target.id, '\t')
                mods.addition(ann)
                ann_obj.add_annotation(ann)
        else:
            try:
                arg_tup = (type, unicode(target.id))

                # Is this an addition or an update?
                if old_type is None and old_target is None:
                    # Already existing as an arg means there is nothing to do
                    if arg_tup not in origin.args:
                        before = unicode(origin)
                        origin.add_argument(type, unicode(target.id))
                        mods.change(before, origin)
                else:
                    # Construct how the old arg would have looked like.
                    # Arguments are stored as (type, target ID) tuples, so
                    # the fallback must be the target's ID, not the
                    # annotation object itself.
                    old_arg_tup = (type if old_type is None else old_type,
                            unicode(target.id) if old_target is None
                            else old_target)

                    # Collision etc. don't do anything
                    if old_arg_tup in origin.args and arg_tup not in origin.args:
                        before = unicode(origin)
                        origin.args.remove(old_arg_tup)
                        origin.add_argument(type, unicode(target.id))
                        mods.change(before, origin)
            except AttributeError:
                # The annotation did not have args, it was most likely an entity
                # thus we need to create a new Event...
                new_id = ann_obj.get_new_id('E')
                ann = EventAnnotation(
                            origin.id,
                            [arg_tup],
                            new_id,
                            origin.type,
                            ''
                            )
                ann_obj.add_annotation(ann)
                mods.addition(ann)

        mods_json = mods.json_response()
        mods_json['annotations'] = _json_from_ann(ann_obj)
        return mods_json
Пример #46
0
def tag(collection, document, tagger):
    """Send a document's text to a tagger service and store the result.

    The service URL is taken from the project's annotator configuration
    entry whose token equals ``tagger``. The UTF-8 encoded document text is
    POSTed to it, and the text-bound and normalization entries of the JSON
    response are added to the document's annotations.

    Args:
        collection: Collection directory; resolved with ``real_directory``.
        document: Document name, joined onto the resolved directory.
        tagger: Token identifying the tagger in the annotator configuration.

    Returns:
        The modification tracker's JSON response dict, extended with an
        ``'annotations'`` entry built by ``_json_from_ann``.

    Raises:
        UnknownTaggerError: No configured tagger has the token ``tagger``.
        InvalidConnectionSchemeError: The service URL is neither http nor
            https.
        TaggerConnectionError: The request failed with a socket error or the
            service answered with a status other than 200.
        InvalidTaggerResponseError: The response body is not valid JSON.
        AssertionError: A text-bound entry lacks ``offsets``/``type``/
            ``texts`` or they are inconsistent.
        KeyError: A normalization entry lacks a required key or refers to an
            unknown text-bound entry.
    """
    real_dir = real_directory(collection)
    pconf = ProjectConfiguration(real_dir)
    for tagger_token, _, _, tagger_service_url in pconf.get_annotator_config():
        if tagger == tagger_token:
            break
    else:
        raise UnknownTaggerError(tagger)

    with TextAnnotations(path_join(real_dir, document)) as ann_obj:

        url_soup = urlparse(tagger_service_url)

        if url_soup.scheme == 'http':
            Connection = HTTPConnection
        elif url_soup.scheme == 'https':
            # Delayed HTTPS import since it relies on SSL which is commonly
            #   missing if you roll your own Python, for once we should not
            #   fail early since tagging is currently an edge case and we
            #   can't allow it to bring down the whole server.
            from http.client import HTTPSConnection
            Connection = HTTPSConnection
        else:
            raise InvalidConnectionSchemeError(tagger_token, url_soup.scheme)

        conn = None
        try:
            conn = Connection(url_soup.netloc)
            req_headers = {
                'Content-type': 'text/plain; charset=utf-8',
                'Accept': 'application/json',
            }
            # Build a new service URL since the request method doesn't accept
            #   a parameters argument
            service_url = url_soup.path + (
                '?' + url_soup.query if url_soup.query else '')
            try:
                # Encode first so Content-length counts bytes, not characters
                data = ann_obj.get_document_text().encode('utf-8')
                req_headers['Content-length'] = len(data)
                conn.request('POST',
                             # As per: http://bugs.python.org/issue11898
                             # Force the url to be an ascii string
                             str(service_url),
                             data,
                             headers=req_headers)
            except SocketError as e:
                raise TaggerConnectionError(tagger_token, e)
            resp = conn.getresponse()

            # Did the request succeed?
            if resp.status != 200:
                raise TaggerConnectionError(
                    tagger_token, '%s %s' %
                    (resp.status, resp.reason))
            # Finally, we can read the response data
            resp_data = resp.read()
        finally:
            if conn is not None:
                conn.close()

        try:
            json_resp = loads(resp_data)
        except ValueError:
            raise InvalidTaggerResponseError(tagger_token, resp_data)

        mods = ModificationTracker()
        # Maps the tagger's own entry IDs to the IDs allocated in ann_obj
        cidmap = {}

        textbounds = ((cid, entry) for cid, entry in json_resp.items()
                      if _is_textbound(entry))
        for cid, ann in textbounds:
            assert 'offsets' in ann, 'Tagger response lacks offsets'
            offsets = ann['offsets']
            assert 'type' in ann, 'Tagger response lacks type'
            _type = ann['type']
            assert 'texts' in ann, 'Tagger response lacks texts'
            texts = ann['texts']

            # sanity
            assert len(offsets) != 0, 'Tagger response has empty offsets'
            assert len(texts) == len(
                offsets), 'Tagger response has different numbers of offsets and texts'

            # Unpacking fails unless the first offset is a two-item sequence
            _start, _end = offsets[0]
            text = texts[0]

            _id = ann_obj.get_new_id('T')
            cidmap[cid] = _id

            tb = TextBoundAnnotationWithText(
                offsets, _id, _type, text, " " + ' '.join(texts[1:]))

            mods.addition(tb)
            ann_obj.add_annotation(tb)

        normalizations = (entry for entry in json_resp.values()
                          if _is_normalization(entry))
        for norm in normalizations:
            # TODO: a missing key currently surfaces as a plain KeyError
            _type = norm['type']
            target = norm['target']
            refdb = norm['refdb']
            refid = norm['refid']

            _id = ann_obj.get_new_id('N')
            # Translate the tagger's target ID to the ID we allocated above
            target = cidmap[target]

            na = NormalizationAnnotation(_id, _type, target, refdb, refid, '')

            mods.addition(na)
            ann_obj.add_annotation(na)

        mod_resp = mods.json_response()
        mod_resp['annotations'] = _json_from_ann(ann_obj)
        return mod_resp
Пример #47
0
        for docname in base_names:
            try:
                with Annotations(path_join(directory, docname),
                                 read_only=True) as ann_obj:
                    tb_count = len([a for a in ann_obj.get_entities()])
                    rel_count = (len([a for a in ann_obj.get_relations()]) +
                                 len([a for a in ann_obj.get_equivs()]))
                    event_count = len([a for a in ann_obj.get_events()])

                    if options_get_validation(directory) == 'none':
                        docstats.append([tb_count, rel_count, event_count])
                    else:
                        # verify and include verification issue count
                        try:
                            from projectconfig import ProjectConfiguration
                            projectconf = ProjectConfiguration(directory)
                            from verify_annotations import verify_annotation
                            issues = verify_annotation(ann_obj, projectconf)
                            issue_count = len(issues)
                        except:
                            # TODO: error reporting
                            issue_count = -1
                        docstats.append(
                            [tb_count, rel_count, event_count, issue_count])
            except Exception, e:
                log_info('Received "%s" when trying to generate stats' % e)
                # Pass exceptions silently, just marking stats missing
                docstats.append([-1] * len(stat_types))

        # Cache the statistics
        try:
Пример #48
0
def get_statistics(directory, base_names, use_cache=True):
    """Return annotation statistics for the documents of a directory.

    Statistics are pickled to a per-directory cache file. The cache is
    reused only if it exists, is not older than config.py, the directory's
    configuration or any non-hidden file in the directory, unpickles
    cleanly and holds one entry per name in ``base_names``; otherwise the
    statistics are regenerated and the cache rewritten.

    Args:
        directory: Directory holding the documents.
        base_names: Document base names to produce statistics for.
        use_cache: When False, always regenerate the statistics.

    Returns:
        A ``(stat_types, docstats)`` tuple. ``stat_types`` is a list of
        ``(label, type)`` pairs for entities, relations and events, plus
        issues unless validation is ``'none'``. ``docstats`` holds one list
        of counts per document; a document that could not be processed gets
        ``-1`` in every column, and a failed verification gets ``-1`` as its
        issue count.

    Raises:
        OSError: If reading the cache file's modification time fails for a
            reason other than the file not existing.
    """
    from errno import ENOENT

    # Check if we have a cache of the costly statistics generation
    # Also, only use it if no file is newer than the cache itself
    cache_file_path = get_stat_cache_by_dir(directory)

    try:
        cache_mtime = getmtime(cache_file_path)
    except OSError as e:
        if e.errno != ENOENT:
            raise
        # No cache file yet: treat the cache as older than everything
        cache_mtime = -1

    docstats = []
    try:
        if (not isfile(cache_file_path)
                # Has config.py been changed?
                or getmtime(get_config_py_path()) > cache_mtime
                # Any file has changed in the dir since the cache was generated
                or any(True for f in listdir(directory)
                       if (getmtime(path_join(directory, f)) > cache_mtime
                           # Ignore hidden files
                           and not f.startswith('.')))
                # The configuration is newer than the cache
                or getmtime(get_config_path(directory)) > cache_mtime):
            generate = True
        else:
            generate = False
            try:
                # NOTE(review): unpickling executes whatever the cache file
                # contains; the cache location must only be writable by
                # trusted users.
                with open(cache_file_path, 'rb') as cache_file:
                    docstats = pickle_load(cache_file)
                if len(docstats) != len(base_names):
                    Messager.warning(
                        'Stats cache %s was incomplete; regenerating' %
                        cache_file_path)
                    generate = True
            except UnpicklingError:
                # Corrupt data, re-generate
                Messager.warning(
                    'Stats cache %s was corrupted; regenerating' %
                    cache_file_path, -1)
                generate = True
            except EOFError:
                # Corrupt data, re-generate
                generate = True
    except OSError:
        Messager.warning(
            'Failed checking file modification times for stats cache check; regenerating'
        )
        generate = True

    if not use_cache:
        generate = True

    # "header" and types
    stat_types = [("Entities", "int"), ("Relations", "int"), ("Events", "int")]

    # Looked up once; the setting is per directory, not per document
    validate = options_get_validation(directory) != 'none'
    if validate:
        stat_types.append(("Issues", "int"))

    if generate:
        # Generate the document statistics from scratch
        log_info('generating statistics for "%s"' % directory)
        docstats = []
        for docname in base_names:
            try:
                with Annotations(path_join(directory, docname),
                                 read_only=True) as ann_obj:
                    tb_count = sum(1 for _ in ann_obj.get_entities())
                    rel_count = (sum(1 for _ in ann_obj.get_relations()) +
                                 sum(1 for _ in ann_obj.get_equivs()))
                    event_count = sum(1 for _ in ann_obj.get_events())

                    if not validate:
                        docstats.append([tb_count, rel_count, event_count])
                    else:
                        # verify and include verification issue count
                        try:
                            from projectconfig import ProjectConfiguration
                            projectconf = ProjectConfiguration(directory)
                            from verify_annotations import verify_annotation
                            issues = verify_annotation(ann_obj, projectconf)
                            issue_count = len(issues)
                        except Exception:
                            # Best effort: a failed verification is reported
                            # as -1, but KeyboardInterrupt/SystemExit must
                            # still propagate.
                            # TODO: error reporting
                            issue_count = -1
                        docstats.append(
                            [tb_count, rel_count, event_count, issue_count])
            except Exception as e:
                log_info('Received "%s" when trying to generate stats' % e)
                # Pass exceptions silently, just marking stats missing
                docstats.append([-1] * len(stat_types))

        # Cache the statistics
        try:
            with open(cache_file_path, 'wb') as cache_file:
                pickle_dump(docstats, cache_file)
        except IOError as e:
            Messager.warning(
                "Could not write statistics cache file to directory %s: %s" %
                (directory, e))

    return stat_types, docstats