Example #1
def calc_stats(sys_laf, ref_dir, ltf_dir):
    """Return hits, false alarms, and misses for system output2 LAF relative
    to reference LAF located in ref_dir.
    
    Inputs
    ------
    sys_laf : str
        LAF file containing system output.

    ref_dir : str
        Directory containing reference LAF files.

    ltf_dir : str
        Directory containing LTF files.
    """
    # Load the system LAF, the corresponding reference LAF, and the LTF,
    # and check that each is valid.
    sys_doc = load_doc(sys_laf, LAFDocument, logger)
    bn = os.path.basename(sys_laf)
    ref_laf = os.path.join(ref_dir, bn)
    ref_doc = load_doc(ref_laf, LAFDocument, logger)
    ltf = os.path.join(ltf_dir, bn.replace('.laf.xml', '.ltf.xml'))
    ltf_doc = load_doc(ltf, LTFDocument, logger)
    if not all([sys_doc, ref_doc, ltf_doc]):
        return 0.0, 0.0, 0.0

    # Calculate hits, misses, and false alarms.
    try:
        tokens, token_ids, token_onsets, token_offsets = ltf_doc.tokenized()

        # Convert mentions to (token_onset, token_offset, tag) format.
        sys_mentions = sys_doc.mentions()
        if len(sys_mentions) > 0:
            sys_ids, sys_tags, sys_extents, sys_char_onsets, sys_char_offsets = zip(*sys_mentions)
            sys_mention_onsets, sys_mention_offsets = convert_extents(sys_char_onsets, sys_char_offsets,
                                                                      token_onsets, token_offsets)
            sys_mentions = zip(sys_tags, sys_mention_onsets, sys_mention_offsets)
            sys_mentions = set(map(tuple, sys_mentions))
        else:
            sys_mentions = set()

        ref_mentions = ref_doc.mentions()
        if len(ref_mentions) > 0:
            ref_ids, ref_tags, ref_extents, ref_char_onsets, ref_char_offsets = zip(*ref_mentions)
            ref_mention_onsets, ref_mention_offsets = convert_extents(ref_char_onsets, ref_char_offsets,
                                                                      token_onsets, token_offsets)
            ref_mentions = zip(ref_tags, ref_mention_onsets, ref_mention_offsets)
            ref_mentions = set(map(tuple, ref_mentions))
        else:
            ref_mentions = set()

        # Calculate.
        n_hit = len(sys_mentions & ref_mentions)
        n_fa = len(sys_mentions - ref_mentions)
        n_miss = len(ref_mentions - sys_mentions)
    except Exception:
        logger.warn('Scoring failed for %s. Skipping.' % ref_laf)
        n_hit = n_fa = n_miss = 0.0

    return n_hit, n_fa, n_miss
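
A minimal sketch of how the counts returned by calc_stats might be aggregated into corpus-level precision, recall, and F1; the score_system name and the list of system LAF paths are assumptions for illustration, not part of the original code.

def score_system(sys_lafs, ref_dir, ltf_dir):
    # Accumulate hits, false alarms, and misses over all system LAF files.
    n_hit = n_fa = n_miss = 0.0
    for sys_laf in sys_lafs:
        hit, fa, miss = calc_stats(sys_laf, ref_dir, ltf_dir)
        n_hit += hit
        n_fa += fa
        n_miss += miss
    # Convert the pooled counts to precision/recall/F1, guarding against
    # empty denominators.
    precision = n_hit / (n_hit + n_fa) if n_hit + n_fa > 0 else 0.0
    recall = n_hit / (n_hit + n_miss) if n_hit + n_miss > 0 else 0.0
    f1 = (2 * precision * recall / (precision + recall)
          if precision + recall > 0 else 0.0)
    return precision, recall, f1
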
Example #2
def convert_mentions(mentions, token_onsets, token_offsets):
    """Convert mentions format to set of tuples of form 
    (start_token, end_token, tag).

    Inputs
    ------
    mentions : sequence
        Sequence of mention tuples. For the format, see LAFDocument.mentions.

    token_onsets : sequence of int
        Sequence of character onsets of tokens.

    token_offsets : sequence of int
        Sequence of character offsets of tokens.

    Outputs
    -------
    mentions_ : set of tuple
        Set of mentions in a different format from that of mentions. Each
        element is a tuple of (TAG, TOKEN_ONSET, TOKEN_OFFSET), where
        TOKEN_ONSET and TOKEN_OFFSET are token indices.
    """
    # Extract sys/ref mentions and represent as tuples of
    #
    #     (tag, token_onset, token_offset)
    if len(mentions) > 0:
        mention_ids, tags, char_onsets, char_offsets = zip(*mentions)
        mention_onsets, mention_offsets = convert_extents(char_onsets, char_offsets,
                                                          token_onsets, token_offsets)
        mentions_ = zip(tags, mention_onsets, mention_offsets)
        mentions_ = set(map(tuple, mentions_))
    else:
        mentions_ = set()

    return mentions_
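
The conversion above relies on convert_extents, which is not shown on this page. A plausible sketch, assuming it maps each mention's character span to the indices of the minimal enclosing run of tokens:

def convert_extents(char_onsets, char_offsets, token_onsets, token_offsets):
    # For each mention, take the first token whose character offset reaches the
    # mention onset and the last token whose character onset does not pass the
    # mention offset; together they bound the minimal enclosing token span.
    mention_onsets = []
    mention_offsets = []
    for char_onset, char_offset in zip(char_onsets, char_offsets):
        start = min(i for i, off in enumerate(token_offsets) if off >= char_onset)
        end = max(i for i, on in enumerate(token_onsets) if on <= char_offset)
        mention_onsets.append(start)
        mention_offsets.append(end)
    return mention_onsets, mention_offsets
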
Example #3
def write_train_data(lafs, ltf_dir, enc, trainf):
    """Extract features and target labels for each LTF/LAF pair and write to
    disk in CRFSuite data format.

    For details regarding this format, consult

    http://www.chokkan.org/software/crfsuite/manual.html

    Inputs
    ------
    lafs: list of str
        Paths to LAF files.

    ltf_dir : str
        Directory to search for LTF files.

    enc : features.Encoder
        Feature encoder.

    trainf : str
        CRFsuite training file.
    """
    with open(trainf, 'w') as f:

        A_vals = set()
        B_vals = set()
        G_vals = set()
        ltfs = []

        for laf in lafs:
            # Derive the LTF path corresponding to this LAF.
            bn = os.path.basename(laf)
            ltf = os.path.join(ltf_dir, bn.replace('.laf.xml', '.ltf.xml'))
            ltfs.append(ltf)

        A_vals, B_vals, G_vals = get_ABG_value_sets(ltfs, logger)

        # NOTE: An earlier revision extracted the per-token A/B/G values from
        # each LTF/LAF pair directly here; that logic has been superseded by
        # the get_ABG_value_sets call above.

        print(
            "Found the following number of values for ABG:\nA: {}\nB: {}\nG: {}\n"
            .format(len(A_vals), len(B_vals), len(G_vals)))

        for laf in lafs:
            # Check that the LTF and LAF are valid.
            bn = os.path.basename(laf)
            ltf = os.path.join(ltf_dir, bn.replace('.laf.xml', '.ltf.xml'))
            laf_doc = load_doc(laf, LAFDocument, logger)
            ltf_doc = load_doc(ltf, LTFDocument, logger)
            if laf_doc is None or ltf_doc is None:
                continue

            # Extract features/targets.
            try:
                # Extract tokens.
                try:
                    (tokens, token_ids, token_onsets, token_offsets, token_nums,
                     token_As, token_Bs, token_Gs, token_Fs, token_Js) = ltf_doc.tokenizedWithABG()
                except Exception:
                    tokens, token_ids, token_onsets, token_offsets, token_nums = ltf_doc.tokenized()
                    token_As = token_Bs = token_Gs = token_Fs = token_Js = None

                # Convert mentions to format expected by the encoder; that is,
                # (tag, token_onset, token_offset).
                mentions = laf_doc.mentions()
                if len(mentions) == 0:
                    mentions_ = []
                else:
                    # Map to the minimal enclosing span of tokens in the
                    # supplied LTF.
                    entity_ids, tags, extents, char_onsets, char_offsets = zip(
                        *mentions)
                    mention_onsets, mention_offsets = convert_extents(
                        char_onsets, char_offsets, token_onsets, token_offsets)
                    mentions_ = list(zip(tags, mention_onsets,
                                         mention_offsets))

                # Eliminate overlapping mentions, retaining whichever
                # is first when sorted in ascending order by (onset, offset).
                sort_mentions(mentions_)
                prev_mention_offset = -1
                temp_mentions_ = []
                for tag, mention_onset, mention_offset in mentions_:
                    if mention_onset > prev_mention_offset:
                        temp_mentions_.append(
                            [tag, mention_onset, mention_offset])
                    prev_mention_offset = mention_offset
                mentions_ = temp_mentions_

                feats, targets = enc.get_feats_targets(tokens, mentions_,
                                                       token_nums, token_As,
                                                       token_Bs, token_Gs,
                                                       token_Fs, token_Js,
                                                       A_vals, B_vals, G_vals)

            except Exception:
                logger.warn('Feature extraction failed for %s. Skipping.' %
                            laf)
                continue

            # Write to file.
            write_crfsuite_file(f, feats, targets)
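
write_crfsuite_file is not shown on this page either. A minimal sketch, assuming feats is a per-token list of feature lists and targets is the parallel list of labels, following the one-item-per-line, tab-separated layout described in the CRFsuite manual:

def write_crfsuite_file(f, feats, targets):
    # One line per token: the label, a tab, then the tab-separated features.
    # A blank line terminates the sequence, as the CRFsuite data format requires.
    for token_feats, target in zip(feats, targets):
        f.write('%s\t%s\n' % (target, '\t'.join(token_feats)))
    f.write('\n')
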
Example #4
def write_train_data(lafs, ltf_dir, enc, trainf):
    """Extract features and target labels for each LTF/LAF pair and write to
    disk in CRFSuite data format.

    For details regarding this format, consult

    http://www.chokkan.org/software/crfsuite/manual.html

    Inputs
    ------
    lafs: list of str
        Paths to LAF files.

    ltf_dir : str
        Directory to search for LTF files.

    enc : features.Encoder
        Feature encoder.

    trainf : str
        CRFsuite training file.
    """
    with open(trainf, 'w') as f:
        for laf in lafs:
            # Check that the LTF and LAF are valid.
            bn = os.path.basename(laf)
            ltf = os.path.join(ltf_dir, bn.replace('.laf.xml', '.ltf.xml'))
            laf_doc = load_doc(laf, LAFDocument, logger)
            ltf_doc = load_doc(ltf, LTFDocument, logger)
            if laf_doc is None or ltf_doc is None:
                continue
            
            # Extract features/targets.
            try:
                # Extract tokens.
                tokens, token_ids, token_onsets, token_offsets = ltf_doc.tokenized()
                
                # Convert mentions to format expected by the encoder; that is,
                # (tag, token_onset, token_offset).
                mentions = laf_doc.mentions()
                if len(mentions) == 0:
                    mentions_ = []
                else:
                    # Map to the minimal enclosing span of tokens in the
                    # supplied LTF.
                    entity_ids, tags, extents, char_onsets, char_offsets = zip(*mentions)
                    mention_onsets, mention_offsets = convert_extents(char_onsets, char_offsets,
                                                                      token_onsets, token_offsets)
                    mentions_ = list(zip(tags, mention_onsets, mention_offsets))

                # Eliminate overlapping mentions, retaining whichever
                # is first when sorted in ascending order by (onset, offset).
                sort_mentions(mentions_)
                prev_mention_offset = -1
                temp_mentions_ = []
                for tag, mention_onset, mention_offset in mentions_:
                    if mention_onset > prev_mention_offset:
                        temp_mentions_.append([tag, mention_onset, mention_offset])
                    prev_mention_offset = mention_offset
                mentions_ = temp_mentions_

                # Extract features/targets and write to file in CRFSuite
                # format.
                feats, targets = enc.get_feats_targets(tokens, mentions_)
            except KeyError:
                logger.warn('Feature extraction failed for %s. Skipping.' % laf)
                continue

            # Write to file.
            write_crfsuite_file(f, feats, targets)
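
A hypothetical invocation of write_train_data; the directory paths, glob pattern, and default Encoder construction below are assumptions for illustration only.

import glob
import os

laf_dir = '/path/to/laf'   # hypothetical directory of .laf.xml files
ltf_dir = '/path/to/ltf'   # hypothetical directory of matching .ltf.xml files
lafs = sorted(glob.glob(os.path.join(laf_dir, '*.laf.xml')))
enc = Encoder()            # assumes features.Encoder can be built with defaults
write_train_data(lafs, ltf_dir, enc, 'train.crfsuite.txt')
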
Example #5
def write_train_data(lafs, ltf_dir, enc, trainf):
    """Extract features and target labels for each LTF/LAF pair and write to
    disk in CRFSuite data format.

    For details regarding this format, consult

    http://www.chokkan.org/software/crfsuite/manual.html

    Inputs
    ------
    lafs: list of str
        Paths to LAF files.

    ltf_dir : str
        Directory to search for LTF files.

    enc : features.Encoder
        Feature encoder.

    trainf : str
        CRFsuite training file.
    """
    with open(trainf, 'w') as f:
        for laf in lafs:
            # Check that the LTF and LAF are valid.
            bn = os.path.basename(laf)
            ltf = os.path.join(ltf_dir, bn.replace('.laf.xml', '.ltf.xml'))
            laf_doc = load_doc(laf, LAFDocument, logger)
            ltf_doc = load_doc(ltf, LTFDocument, logger)
            if laf_doc is None or ltf_doc is None:
                continue
            
            # Extract features/targets.
            try:
                # Extract tokens.
                tokens, token_ids, token_onsets, token_offsets = ltf_doc.tokenized()

                # Convert mentions to the format expected by the encoder; that is,
                # (tag, token_onset, token_offset).
                mentions = laf_doc.mentions()
                if len(mentions) == 0:
                    mentions_ = []
                else:
                    # Map to the minimal enclosing span of tokens in the
                    # supplied LTF.
                    entity_ids, tags, extents, char_onsets, char_offsets = zip(*mentions)
                    mention_onsets, mention_offsets = convert_extents(char_onsets, char_offsets,
                                                                      token_onsets, token_offsets)
                    mentions_ = list(zip(tags, mention_onsets, mention_offsets))

                # Eliminate overlapping mentions, retaining whichever
                # is first when sorted in ascending order by (onset, offset).
                sort_mentions(mentions_)
                prev_mention_offset = -1
                temp_mentions_ = []
                for tag, mention_onset, mention_offset in mentions_:
                    if mention_onset > prev_mention_offset:
                        temp_mentions_.append([tag, mention_onset, mention_offset])
                    prev_mention_offset = mention_offset
                mentions_ = temp_mentions_

                # Extract features/targets and write to file in CRFSuite
                # format.
                feats, targets = enc.get_feats_targets(tokens, mentions_)
            except KeyError:
                logger.warn('Feature extraction failed for %s. Skipping.' % laf)
                continue

            # Write to file.
            write_crfsuite_file(f, feats, targets)
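
sort_mentions is used by every variant above but never shown. Going by the comment about ascending (onset, offset) order, a plausible in-place sketch:

def sort_mentions(mentions):
    # Sort mentions in place by (token_onset, token_offset), ignoring the tag,
    # so that the overlap-removal pass keeps the earliest-starting mention.
    mentions.sort(key=lambda mention: (mention[1], mention[2]))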