Пример #1
0
                    'Stats cache %s was corrupted; regenerating' %
                    cache_file_path, -1)
                generate = True
            except EOFError:
                # Corrupt data, re-generate
                generate = True
    except OSError, e:
        Messager.warning(
            'Failed checking file modification times for stats cache check; regenerating'
        )
        generate = True

    # "header" and types
    stat_types = [("Entities", "int"), ("Relations", "int"), ("Events", "int")]

    if options_get_validation(directory) != 'none':
        stat_types.append(("Issues", "int"))

    if generate:
        # Generate the document statistics from scratch
        from annotation import JOINED_ANN_FILE_SUFF
        log_info('generating statistics for "%s"' % directory)
        docstats = []
        for docname in base_names:
            try:
                with Annotations(path_join(directory, docname),
                                 read_only=True) as ann_obj:
                    tb_count = len([a for a in ann_obj.get_entities()])
                    rel_count = (len([a for a in ann_obj.get_relations()]) +
                                 len([a for a in ann_obj.get_equivs()]))
                    event_count = len([a for a in ann_obj.get_events()])
Пример #2
0
def _enrich_json_with_data(j_dic, ann_obj):
    # TODO: figure out if there's a reason for all the unicode()
    # invocations here; remove if not.

    # We collect trigger ids to be able to link the textbound later on
    trigger_ids = set()
    for event_ann in ann_obj.get_events():
        trigger_ids.add(event_ann.trigger)
        j_dic['events'].append([
            unicode(event_ann.id),
            unicode(event_ann.trigger), event_ann.args
        ])

    for rel_ann in ann_obj.get_relations():
        j_dic['relations'].append([
            unicode(rel_ann.id),
            unicode(rel_ann.type),
            [(rel_ann.arg1l, rel_ann.arg1), (rel_ann.arg2l, rel_ann.arg2)]
        ])

    for tb_ann in ann_obj.get_textbounds():
        #j_tb = [unicode(tb_ann.id), tb_ann.type, tb_ann.start, tb_ann.end]
        j_tb = [unicode(tb_ann.id), tb_ann.type, tb_ann.spans]

        # If we spotted it in the previous pass as a trigger for an
        # event or if the type is known to be an event type, we add it
        # as a json trigger.
        # TODO: proper handling of disconnected triggers. Currently
        # these will be erroneously passed as 'entities'
        if unicode(tb_ann.id) in trigger_ids:
            j_dic['triggers'].append(j_tb)
            # special case for BioNLP ST 2013 format: send triggers
            # also as entities for those triggers that are referenced
            # from annotations other than events (#926).
            if BIONLP_ST_2013_COMPATIBILITY:
                if tb_ann.id in ann_obj.externally_referenced_triggers:
                    try:
                        j_dic['entities'].append(j_tb)
                    except KeyError:
                        j_dic['entities'] = [
                            j_tb,
                        ]
        else:
            try:
                j_dic['entities'].append(j_tb)
            except KeyError:
                j_dic['entities'] = [
                    j_tb,
                ]

    for eq_ann in ann_obj.get_equivs():
        j_dic['equivs'].append(
            (['*', eq_ann.type] + [e for e in eq_ann.entities]))

    for att_ann in ann_obj.get_attributes():
        j_dic['attributes'].append([
            unicode(att_ann.id),
            unicode(att_ann.type),
            unicode(att_ann.target), att_ann.value
        ])

    for norm_ann in ann_obj.get_normalizations():
        j_dic['normalizations'].append([
            unicode(norm_ann.id),
            unicode(norm_ann.type),
            unicode(norm_ann.target),
            unicode(norm_ann.refdb),
            unicode(norm_ann.refid),
            unicode(norm_ann.reftext)
        ])

    for com_ann in ann_obj.get_oneline_comments():
        comment = [
            unicode(com_ann.target),
            unicode(com_ann.type),
            com_ann.tail.strip()
        ]
        try:
            j_dic['comments'].append(comment)
        except KeyError:
            j_dic['comments'] = [
                comment,
            ]

    if ann_obj.failed_lines:
        error_msg = 'Unable to parse the following line(s):\n%s' % (
            '\n'.join([
                (
                    '%s: %s' % (
                        # The line number is off by one
                        unicode(line_num + 1),
                        unicode(ann_obj[line_num]))).strip()
                for line_num in ann_obj.failed_lines
            ]))
        Messager.error(error_msg, duration=len(ann_obj.failed_lines) * 3)

    j_dic['mtime'] = ann_obj.ann_mtime
    j_dic['ctime'] = ann_obj.ann_ctime

    try:
        # XXX avoid digging the directory from the ann_obj
        import os
        docdir = os.path.dirname(ann_obj._document)
        if options_get_validation(docdir) in (
                'all',
                'full',
        ):
            from verify_annotations import verify_annotation
            projectconf = ProjectConfiguration(docdir)
            issues = verify_annotation(ann_obj, projectconf)
        else:
            issues = []
    except Exception, e:
        # TODO add an issue about the failure?
        issues = []
        Messager.error('Error: verify_annotation() failed: %s' % e, -1)
Пример #3
0
def _enrich_json_with_data(j_dic, ann_obj):
    # TODO: figure out if there's a reason for all the unicode()
    # invocations here; remove if not.

    # We collect trigger ids to be able to link the textbound later on
    trigger_ids = set()
    for event_ann in ann_obj.get_events():
        trigger_ids.add(event_ann.trigger)
        j_dic['events'].append(
                [unicode(event_ann.id), unicode(event_ann.trigger), event_ann.args]
                )

    for rel_ann in ann_obj.get_relations():
        j_dic['relations'].append(
            [unicode(rel_ann.id), unicode(rel_ann.type), 
             [(rel_ann.arg1l, rel_ann.arg1),
              (rel_ann.arg2l, rel_ann.arg2)]]
            )

    for tb_ann in ann_obj.get_textbounds():
        #j_tb = [unicode(tb_ann.id), tb_ann.type, tb_ann.start, tb_ann.end]
        j_tb = [unicode(tb_ann.id), tb_ann.type, tb_ann.spans]

        # If we spotted it in the previous pass as a trigger for an
        # event or if the type is known to be an event type, we add it
        # as a json trigger.
        # TODO: proper handling of disconnected triggers. Currently
        # these will be erroneously passed as 'entities'
        if unicode(tb_ann.id) in trigger_ids:
            j_dic['triggers'].append(j_tb)
            # special case for BioNLP ST 2013 format: send triggers
            # also as entities for those triggers that are referenced
            # from annotations other than events (#926).
            if BIONLP_ST_2013_COMPATIBILITY:
                if tb_ann.id in ann_obj.externally_referenced_triggers:
                    try:
                        j_dic['entities'].append(j_tb)
                    except KeyError:
                        j_dic['entities'] = [j_tb, ]
        else: 
            try:
                j_dic['entities'].append(j_tb)
            except KeyError:
                j_dic['entities'] = [j_tb, ]


    for eq_ann in ann_obj.get_equivs():
        j_dic['equivs'].append(
                (['*', eq_ann.type]
                    + [e for e in eq_ann.entities])
                )

    for att_ann in ann_obj.get_attributes():
        j_dic['attributes'].append(
                [unicode(att_ann.id), unicode(att_ann.type), unicode(att_ann.target), att_ann.value]
                )

    for norm_ann in ann_obj.get_normalizations():
        j_dic['normalizations'].append(
                [unicode(norm_ann.id), unicode(norm_ann.type), 
                 unicode(norm_ann.target), unicode(norm_ann.refdb), 
                 unicode(norm_ann.refid), unicode(norm_ann.reftext)]
                )

    for com_ann in ann_obj.get_oneline_comments():
        comment = [unicode(com_ann.target), unicode(com_ann.type),
                com_ann.tail.strip()]
        try:
            j_dic['comments'].append(comment)
        except KeyError:
            j_dic['comments'] = [comment, ]

    if ann_obj.failed_lines:
        error_msg = 'Unable to parse the following line(s):\n%s' % (
                '\n'.join(
                [('%s: %s' % (
                            # The line number is off by one
                            unicode(line_num + 1),
                            unicode(ann_obj[line_num])
                            )).strip()
                 for line_num in ann_obj.failed_lines])
                )
        Messager.error(error_msg, duration=len(ann_obj.failed_lines) * 3)

    j_dic['mtime'] = ann_obj.ann_mtime
    j_dic['ctime'] = ann_obj.ann_ctime

    try:
        # XXX avoid digging the directory from the ann_obj
        import os
        docdir = os.path.dirname(ann_obj._document)
        if options_get_validation(docdir) in ('all', 'full', ):
            from verify_annotations import verify_annotation
            projectconf = ProjectConfiguration(docdir)
            issues = verify_annotation(ann_obj, projectconf)
        else:
            issues = []
    except Exception, e:
        # TODO add an issue about the failure?
        issues = []
        Messager.error('Error: verify_annotation() failed: %s' % e, -1)
Пример #4
0
def get_statistics(directory, base_names, use_cache=True):
    # Check if we have a cache of the costly satistics generation
    # Also, only use it if no file is newer than the cache itself
    cache_file_path = get_stat_cache_by_dir(directory)

    try:
        cache_mtime = getmtime(cache_file_path)
    except OSError as e:
        if e.errno == 2:
            cache_mtime = -1
        else:
            raise

    try:
        if (not isfile(cache_file_path)
                # Has config.py been changed?
                or getmtime(get_config_py_path()) > cache_mtime
                # Any file has changed in the dir since the cache was generated
                or any(True for f in listdir(directory)
                       if (getmtime(path_join(directory, f)) > cache_mtime
                           # Ignore hidden files
                           and not f.startswith('.')))
                # The configuration is newer than the cache
                or getmtime(get_config_path(directory)) > cache_mtime):
            generate = True
            docstats = []
        else:
            generate = False
            try:
                with open(cache_file_path, 'rb') as cache_file:
                    docstats = pickle_load(cache_file)
                if len(docstats) != len(base_names):
                    Messager.warning(
                        'Stats cache %s was incomplete; regenerating' %
                        cache_file_path)
                    generate = True
                    docstats = []
            except UnpicklingError:
                # Corrupt data, re-generate
                Messager.warning(
                    'Stats cache %s was corrupted; regenerating' %
                    cache_file_path, -1)
                generate = True
            except EOFError:
                # Corrupt data, re-generate
                generate = True
    except OSError as e:
        Messager.warning(
            'Failed checking file modification times for stats cache check; regenerating'
        )
        generate = True

    if not use_cache:
        generate = True

    # "header" and types
    stat_types = [("实体", "int"), ("关系", "int"), ("事件", "int")]

    if options_get_validation(directory) != 'none':
        stat_types.append(("观点", "int"))

    stat_types.append(("修改者", "string"))
    if generate:
        # Generate the document statistics from scratch
        from annotation import JOINED_ANN_FILE_SUFF
        log_info('generating statistics for "%s"' % directory)
        docstats = []
        for docname in base_names:
            try:
                # 在这里获取实体,关系,事件,修改者。
                with Annotations(path_join(directory, docname),
                                 read_only=True) as ann_obj:
                    tb_count = len([a for a in ann_obj.get_entities()])
                    rel_count = (len([a for a in ann_obj.get_relations()]) +
                                 len([a for a in ann_obj.get_equivs()]))
                    event_count = len([a for a in ann_obj.get_events()])

                    try:
                        user = get_session().get('user')
                    except KeyError:
                        user = None

                    if user is None:
                        user = '******'

                    if options_get_validation(directory) == 'none':
                        docstats.append(
                            [tb_count, rel_count, event_count, user])
                    else:
                        # verify and include verification issue count
                        try:
                            from projectconfig import ProjectConfiguration
                            projectconf = ProjectConfiguration(directory)
                            from verify_annotations import verify_annotation
                            issues = verify_annotation(ann_obj, projectconf)
                            issue_count = len(issues)
                        except BaseException:
                            # TODO: error reporting
                            issue_count = -1
                        docstats.append([
                            tb_count, rel_count, event_count, issue_count, user
                        ])
            except Exception as e:
                log_info('Received "%s" when trying to generate stats' % e)
                # Pass exceptions silently, just marking stats missing
                docstats.append([-1] * len(stat_types))

        try:
            user = get_session().get('user')
        except KeyError:
            user = None
        if user is None:
            user = '******'
        # Cache the statistics
        try:
            with open(cache_file_path, 'wb') as cache_file:
                pickle_dump(docstats, cache_file)
        except IOError as e:
            Messager.warning(
                "Could not write statistics cache file to directory %s: %s" %
                (directory, e))

    return stat_types, docstats
Пример #5
0
    if generate:
        # Generate the document statistics from scratch
        from annotation import JOINED_ANN_FILE_SUFF
        log_info('generating statistics for "%s"' % directory)
        docstats = []
        for docname in base_names:
            try:
                with Annotations(path_join(directory, docname), 
                        read_only=True) as ann_obj:
                    tb_count = len([a for a in ann_obj.get_entities()])
                    rel_count = (len([a for a in ann_obj.get_relations()]) +
                                 len([a for a in ann_obj.get_equivs()]))
                    event_count = len([a for a in ann_obj.get_events()])


                    if options_get_validation(directory) == 'none':
                        docstats.append([tb_count, rel_count, event_count])
                    else:
                        # verify and include verification issue count
                        try:
                            from projectconfig import ProjectConfiguration
                            projectconf = ProjectConfiguration(directory)
                            from verify_annotations import verify_annotation
                            issues = verify_annotation(ann_obj, projectconf)
                            issue_count = len(issues)
                        except:
                            # TODO: error reporting
                            issue_count = -1
                        docstats.append([tb_count, rel_count, event_count, issue_count])
            except Exception, e:
                log_info('Received "%s" when trying to generate stats' % e)
Пример #6
0
def get_statistics(directory, base_names, use_cache=True):
    # Check if we have a cache of the costly satistics generation
    # Also, only use it if no file is newer than the cache itself
    cache_file_path = get_stat_cache_by_dir(directory)

    try:
        cache_mtime = getmtime(cache_file_path)
    except OSError as e:
        if e.errno == 2:
            cache_mtime = -1
        else:
            raise

    try:
        if (not isfile(cache_file_path)
                # Has config.py been changed?
                or getmtime(get_config_py_path()) > cache_mtime
                # Any file has changed in the dir since the cache was generated
                or any(True for f in listdir(directory)
                       if (getmtime(path_join(directory, f)) > cache_mtime
                           # Ignore hidden files
                           and not f.startswith('.')))
                # The configuration is newer than the cache
                or getmtime(get_config_path(directory)) > cache_mtime):
            generate = True
            docstats = []
        else:
            generate = False
            try:
                with open(cache_file_path, 'rb') as cache_file:
                    docstats = pickle_load(cache_file)
                if len(docstats) != len(base_names):
                    Messager.warning(
                        'Stats cache %s was incomplete; regenerating' %
                        cache_file_path)
                    generate = True
                    docstats = []
            except UnpicklingError:
                # Corrupt data, re-generate
                Messager.warning(
                    'Stats cache %s was corrupted; regenerating' %
                    cache_file_path, -1)
                generate = True
            except EOFError:
                # Corrupt data, re-generate
                generate = True
    except OSError as e:
        Messager.warning(
            'Failed checking file modification times for stats cache check; regenerating')
        generate = True

    if not use_cache:
        generate = True

    # "header" and types
    stat_types = [("Entities", "int"), ("Relations", "int"), ("Events", "int")]

    if options_get_validation(directory) != 'none':
        stat_types.append(("Issues", "int"))

    if generate:
        # Generate the document statistics from scratch
        from annotation import JOINED_ANN_FILE_SUFF
        log_info('generating statistics for "%s"' % directory)
        docstats = []
        for docname in base_names:
            try:
                with Annotations(path_join(directory, docname),
                                 read_only=True) as ann_obj:
                    tb_count = len([a for a in ann_obj.get_entities()])
                    rel_count = (len([a for a in ann_obj.get_relations()]) +
                                 len([a for a in ann_obj.get_equivs()]))
                    event_count = len([a for a in ann_obj.get_events()])

                    if options_get_validation(directory) == 'none':
                        docstats.append([tb_count, rel_count, event_count])
                    else:
                        # verify and include verification issue count
                        try:
                            from projectconfig import ProjectConfiguration
                            projectconf = ProjectConfiguration(directory)
                            from verify_annotations import verify_annotation
                            issues = verify_annotation(ann_obj, projectconf)
                            issue_count = len(issues)
                        except BaseException:
                            # TODO: error reporting
                            issue_count = -1
                        docstats.append(
                            [tb_count, rel_count, event_count, issue_count])
            except Exception as e:
                log_info('Received "%s" when trying to generate stats' % e)
                # Pass exceptions silently, just marking stats missing
                docstats.append([-1] * len(stat_types))

        # Cache the statistics
        try:
            with open(cache_file_path, 'wb') as cache_file:
                pickle_dump(docstats, cache_file)
        except IOError as e:
            Messager.warning(
                "Could not write statistics cache file to directory %s: %s" %
                (directory, e))

    return stat_types, docstats