Пример #1
0
        for docname in base_names:
            try:
                with Annotations(path_join(directory, docname),
                                 read_only=True) as ann_obj:
                    tb_count = len([a for a in ann_obj.get_entities()])
                    rel_count = (len([a for a in ann_obj.get_relations()]) +
                                 len([a for a in ann_obj.get_equivs()]))
                    event_count = len([a for a in ann_obj.get_events()])

                    if options_get_validation(directory) == 'none':
                        docstats.append([tb_count, rel_count, event_count])
                    else:
                        # verify and include verification issue count
                        try:
                            from projectconfig import ProjectConfiguration
                            projectconf = ProjectConfiguration(directory)
                            from verify_annotations import verify_annotation
                            issues = verify_annotation(ann_obj, projectconf)
                            issue_count = len(issues)
                        except:
                            # TODO: error reporting
                            issue_count = -1
                        docstats.append(
                            [tb_count, rel_count, event_count, issue_count])
            except Exception, e:
                log_info('Received "%s" when trying to generate stats' % e)
                # Pass exceptions silently, just marking stats missing
                docstats.append([-1] * len(stat_types))

        # Cache the statistics
        try:
Пример #2
0
def get_statistics(directory, base_names, use_cache=True):
    # Check if we have a cache of the costly satistics generation
    # Also, only use it if no file is newer than the cache itself
    cache_file_path = get_stat_cache_by_dir(directory)

    try:
        cache_mtime = getmtime(cache_file_path)
    except OSError as e:
        if e.errno == 2:
            cache_mtime = -1
        else:
            raise

    try:
        if (not isfile(cache_file_path)
                # Has config.py been changed?
                or getmtime(get_config_py_path()) > cache_mtime
                # Any file has changed in the dir since the cache was generated
                or any(True for f in listdir(directory)
                       if (getmtime(path_join(directory, f)) > cache_mtime
                           # Ignore hidden files
                           and not f.startswith('.')))
                # The configuration is newer than the cache
                or getmtime(get_config_path(directory)) > cache_mtime):
            generate = True
            docstats = []
        else:
            generate = False
            try:
                with open(cache_file_path, 'rb') as cache_file:
                    docstats = pickle_load(cache_file)
                if len(docstats) != len(base_names):
                    Messager.warning(
                        'Stats cache %s was incomplete; regenerating' %
                        cache_file_path)
                    generate = True
                    docstats = []
            except UnpicklingError:
                # Corrupt data, re-generate
                Messager.warning(
                    'Stats cache %s was corrupted; regenerating' %
                    cache_file_path, -1)
                generate = True
            except EOFError:
                # Corrupt data, re-generate
                generate = True
    except OSError as e:
        Messager.warning(
            'Failed checking file modification times for stats cache check; regenerating'
        )
        generate = True

    if not use_cache:
        generate = True

    # "header" and types
    stat_types = [("Entities", "int"), ("Relations", "int"), ("Events", "int")]

    if options_get_validation(directory) != 'none':
        stat_types.append(("Issues", "int"))

    if generate:
        # Generate the document statistics from scratch
        from annotation import JOINED_ANN_FILE_SUFF
        log_info('generating statistics for "%s"' % directory)
        docstats = []
        for docname in base_names:
            try:
                with Annotations(path_join(directory, docname),
                                 read_only=True) as ann_obj:
                    tb_count = len([a for a in ann_obj.get_entities()])
                    rel_count = (len([a for a in ann_obj.get_relations()]) +
                                 len([a for a in ann_obj.get_equivs()]))
                    event_count = len([a for a in ann_obj.get_events()])

                    if options_get_validation(directory) == 'none':
                        docstats.append([tb_count, rel_count, event_count])
                    else:
                        # verify and include verification issue count
                        try:
                            from projectconfig import ProjectConfiguration
                            projectconf = ProjectConfiguration(directory)
                            from verify_annotations import verify_annotation
                            issues = verify_annotation(ann_obj, projectconf)
                            issue_count = len(issues)
                        except BaseException:
                            # TODO: error reporting
                            issue_count = -1
                        docstats.append(
                            [tb_count, rel_count, event_count, issue_count])
            except Exception as e:
                log_info('Received "%s" when trying to generate stats' % e)
                # Pass exceptions silently, just marking stats missing
                docstats.append([-1] * len(stat_types))

        # Cache the statistics
        try:
            with open(cache_file_path, 'wb') as cache_file:
                pickle_dump(docstats, cache_file)
        except IOError as e:
            Messager.warning(
                "Could not write statistics cache file to directory %s: %s" %
                (directory, e))

    return stat_types, docstats
Пример #3
0
def _enrich_json_with_data(j_dic, ann_obj):
    # TODO: figure out if there's a reason for all the str()
    # invocations here; remove if not.

    # We collect trigger ids to be able to link the textbound later on
    trigger_ids = set()
    for event_ann in ann_obj.get_events():
        trigger_ids.add(event_ann.trigger)
        j_dic['events'].append(
            [str(event_ann.id),
             str(event_ann.trigger), event_ann.args])

    for rel_ann in ann_obj.get_relations():
        j_dic['relations'].append([
            str(rel_ann.id),
            str(rel_ann.type),
            [(rel_ann.arg1l, rel_ann.arg1), (rel_ann.arg2l, rel_ann.arg2)]
        ])

    for tb_ann in ann_obj.get_textbounds():
        #j_tb = [str(tb_ann.id), tb_ann.type, tb_ann.start, tb_ann.end]
        j_tb = [str(tb_ann.id), tb_ann.type, tb_ann.spans]

        # If we spotted it in the previous pass as a trigger for an
        # event or if the type is known to be an event type, we add it
        # as a json trigger.
        # TODO: proper handling of disconnected triggers. Currently
        # these will be erroneously passed as 'entities'
        if str(tb_ann.id) in trigger_ids:
            j_dic['triggers'].append(j_tb)
            # special case for BioNLP ST 2013 format: send triggers
            # also as entities for those triggers that are referenced
            # from annotations other than events (#926).
            if BIONLP_ST_2013_COMPATIBILITY:
                if tb_ann.id in ann_obj.externally_referenced_triggers:
                    try:
                        j_dic['entities'].append(j_tb)
                    except KeyError:
                        j_dic['entities'] = [
                            j_tb,
                        ]
        else:
            try:
                j_dic['entities'].append(j_tb)
            except KeyError:
                j_dic['entities'] = [
                    j_tb,
                ]

    for eq_ann in ann_obj.get_equivs():
        j_dic['equivs'].append(
            (['*', eq_ann.type] + [e for e in eq_ann.entities]))

    for att_ann in ann_obj.get_attributes():
        j_dic['attributes'].append([
            str(att_ann.id),
            str(att_ann.type),
            str(att_ann.target), att_ann.value
        ])

    for norm_ann in ann_obj.get_normalizations():
        j_dic['normalizations'].append([
            str(norm_ann.id),
            str(norm_ann.type),
            str(norm_ann.target),
            str(norm_ann.refdb),
            str(norm_ann.refid),
            str(norm_ann.reftext)
        ])

    for com_ann in ann_obj.get_oneline_comments():
        comment = [
            str(com_ann.target),
            str(com_ann.type),
            com_ann.tail.strip()
        ]
        try:
            j_dic['comments'].append(comment)
        except KeyError:
            j_dic['comments'] = [
                comment,
            ]

    if ann_obj.failed_lines:
        error_msg = 'Unable to parse the following line(s):\n%s' % (
            '\n'.join([
                (
                    '%s: %s' % (
                        # The line number is off by one
                        str(line_num + 1),
                        str(ann_obj[line_num]))).strip()
                for line_num in ann_obj.failed_lines
            ]))
        Messager.error(error_msg, duration=len(ann_obj.failed_lines) * 3)

    j_dic['mtime'] = ann_obj.ann_mtime
    j_dic['ctime'] = ann_obj.ann_ctime

    try:
        # XXX avoid digging the directory from the ann_obj
        import os
        docdir = os.path.dirname(ann_obj._document)
        if options_get_validation(docdir) in (
                'all',
                'full',
        ):
            from verify_annotations import verify_annotation
            projectconf = ProjectConfiguration(docdir)
            issues = verify_annotation(ann_obj, projectconf)
        else:
            issues = []
    except Exception as e:
        # TODO add an issue about the failure?
        issues = []
        Messager.error('Error: verify_annotation() failed: %s' % e, -1)

    for i in issues:
        issue = (str(i.ann_id), i.type, i.description)
        try:
            j_dic['comments'].append(issue)
        except BaseException:
            j_dic['comments'] = [
                issue,
            ]

    # Attach the source files for the annotations and text
    from os.path import splitext
    from annotation import TEXT_FILE_SUFFIX
    ann_files = [splitext(p)[1][1:] for p in ann_obj._input_files]
    ann_files.append(TEXT_FILE_SUFFIX)
    ann_files = sorted([p for p in set(ann_files)])
    j_dic['source_files'] = ann_files
Пример #4
0
def get_annotator_config(directory):
    # TODO: "annotator" is a very confusing term for a web service
    # that does automatic annotation in the context of a tool
    # where most annotators are expected to be human. Rethink.
    return ProjectConfiguration(directory).get_annotator_config()
Пример #5
0
def get_normalization_config(directory):
    return ProjectConfiguration(directory).get_normalization_config()
Пример #6
0
def get_disambiguator_config(directory):
    return ProjectConfiguration(directory).get_disambiguator_config()
Пример #7
0
def get_search_config(directory):
    return ProjectConfiguration(directory).get_search_config()