def calc_stats(sys_laf, ref_dir, ltf_dir):
    """Return hits, false alarms, and misses for system output LAF
    relative to reference LAF located in ref_dir.

    Inputs
    ------
    sys_laf : str
        LAF file containing system output.

    ref_dir : str
        Directory containing reference LAF files.

    ltf_dir : str
        Directory containing LTF files.
    """
    # Check that LTF and system and reference LAF are valid.
    sys_doc = load_doc(sys_laf, LAFDocument, logger)
    bn = os.path.basename(sys_laf)
    ref_laf = os.path.join(ref_dir, bn)
    ref_doc = load_doc(ref_laf, LAFDocument, logger)
    ltf = os.path.join(ltf_dir, bn.replace('.laf.xml', '.ltf.xml'))
    ltf_doc = load_doc(ltf, LTFDocument, logger)
    # Check the loaded documents, not the path strings, which are always
    # truthy.
    if not all([sys_doc, ref_doc, ltf_doc]):
        return 0, 0, 0

    # Calculate hits, misses, and false alarms.
    try:
        tokens, token_ids, token_onsets, token_offsets = ltf_doc.tokenized()

        # Convert mentions to (tag, token_onset, token_offset) format.
        sys_mentions = sys_doc.mentions()
        if len(sys_mentions) > 0:
            sys_ids, sys_tags, sys_extents, sys_char_onsets, sys_char_offsets = zip(*sys_mentions)
            sys_mention_onsets, sys_mention_offsets = convert_extents(
                sys_char_onsets, sys_char_offsets, token_onsets, token_offsets)
            sys_mentions = set(zip(sys_tags, sys_mention_onsets, sys_mention_offsets))
        else:
            sys_mentions = set()
        ref_mentions = ref_doc.mentions()
        if len(ref_mentions) > 0:
            ref_ids, ref_tags, ref_extents, ref_char_onsets, ref_char_offsets = zip(*ref_mentions)
            ref_mention_onsets, ref_mention_offsets = convert_extents(
                ref_char_onsets, ref_char_offsets, token_onsets, token_offsets)
            ref_mentions = set(zip(ref_tags, ref_mention_onsets, ref_mention_offsets))
        else:
            ref_mentions = set()

        # Calculate.
        n_hit = len(sys_mentions & ref_mentions)
        n_fa = len(sys_mentions - ref_mentions)
        n_miss = len(ref_mentions - sys_mentions)
    except Exception:
        logger.warn('Scoring failed for %s. Skipping.' % ref_laf)
        n_hit = n_fa = n_miss = 0
    return n_hit, n_fa, n_miss
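# Illustrative sketch (not part of the original module): one way the
# hit/false-alarm/miss counts returned by calc_stats might be folded into
# precision, recall, and F1. The function name score_prf is hypothetical;
# the formulas are the standard definitions.
def score_prf(n_hit, n_fa, n_miss):
    """Return (precision, recall, f1) from hit/false alarm/miss counts,
    guarding against division by zero."""
    precision = n_hit / (n_hit + n_fa) if n_hit + n_fa else 0.0
    recall = n_hit / (n_hit + n_miss) if n_hit + n_miss else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1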
def convert_mentions(mentions, token_onsets, token_offsets):
    """Convert mentions to a set of (tag, token_onset, token_offset) tuples.

    Inputs
    ------
    mentions : sequence
        Sequence of mention tuples. For format, see LAFDocument.mentions.

    token_onsets : sequence of int
        Sequence of character onsets of tokens.

    token_offsets : sequence of int
        Sequence of character offsets of tokens.

    Outputs
    -------
    mentions_ : set of tuple
        Set of mentions in a DIFFERENT format from that of mentions. Each
        element is a tuple of (TAG, TOKEN_ONSET, TOKEN_OFFSET), where
        TOKEN_ONSET and TOKEN_OFFSET are token indices.
    """
    # Extract mentions and represent as tuples of
    # (tag, token_onset, token_offset). The unpack matches the five-field
    # mention tuples used in calc_stats above.
    if len(mentions) > 0:
        mention_ids, tags, extents, char_onsets, char_offsets = zip(*mentions)
        mention_onsets, mention_offsets = convert_extents(
            char_onsets, char_offsets, token_onsets, token_offsets)
        mentions_ = set(zip(tags, mention_onsets, mention_offsets))
    else:
        mentions_ = set()
    return mentions_
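# Usage sketch (hypothetical helper, not in the original module): because
# convert_mentions returns a set of hashable (tag, token_onset, token_offset)
# tuples, system/reference comparison reduces to set algebra, exactly as in
# calc_stats above. All values here are toy data.
def _demo_set_scoring():
    """Toy demonstration of set-based mention scoring."""
    sys_mentions = {('PER', 0, 1), ('ORG', 5, 6)}
    ref_mentions = {('PER', 0, 1), ('LOC', 8, 9)}
    n_hit = len(sys_mentions & ref_mentions)   # 1: ('PER', 0, 1)
    n_fa = len(sys_mentions - ref_mentions)    # 1: ('ORG', 5, 6)
    n_miss = len(ref_mentions - sys_mentions)  # 1: ('LOC', 8, 9)
    return n_hit, n_fa, n_miss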
def write_train_data(lafs, ltf_dir, enc, trainf):
    """Extract features and target labels for each LTF/LAF pair and write
    to disk in CRFsuite data format.

    For details regarding this format, consult

    http://www.chokkan.org/software/crfsuite/manual.html

    Inputs
    ------
    lafs : list of str
        Paths to LAF files.

    ltf_dir : str
        Directory to search for LTF files.

    enc : features.Encoder
        Feature encoder.

    trainf : str
        CRFsuite training file.
    """
    with open(trainf, 'w') as f:
        # Collect the sets of A/B/G annotation values observed across all
        # LTF files; these are passed to the encoder below.
        ltfs = []
        for laf in lafs:
            bn = os.path.basename(laf)
            ltf = os.path.join(ltf_dir, bn.replace('.laf.xml', '.ltf.xml'))
            ltfs.append(ltf)
        A_vals, B_vals, G_vals = get_ABG_value_sets(ltfs, logger)
        print('Found the following number of values for ABG:\n'
              'A: {}\nB: {}\nG: {}\n'.format(len(A_vals), len(B_vals),
                                             len(G_vals)))

        for laf in lafs:
            # Check that the LTF and LAF are valid.
            bn = os.path.basename(laf)
            ltf = os.path.join(ltf_dir, bn.replace('.laf.xml', '.ltf.xml'))
            laf_doc = load_doc(laf, LAFDocument, logger)
            ltf_doc = load_doc(ltf, LTFDocument, logger)
            if laf_doc is None or ltf_doc is None:
                continue

            # Extract features/targets.
            try:
                # Extract tokens, falling back to the ABG-free tokenization
                # if the document lacks ABG annotations.
                try:
                    (tokens, token_ids, token_onsets, token_offsets,
                     token_nums, token_As, token_Bs, token_Gs,
                     token_Fs, token_Js) = ltf_doc.tokenizedWithABG()
                except Exception:
                    tokens, token_ids, token_onsets, token_offsets, token_nums = ltf_doc.tokenized()
                    token_As = token_Bs = token_Gs = token_Fs = token_Js = None

                # Convert mentions to format expected by the encoder; that is,
                # (tag, token_onset, token_offset).
                mentions = laf_doc.mentions()
                if len(mentions) == 0:
                    mentions_ = []
                else:
                    # Map to the minimal enclosing span of tokens in the
                    # supplied LTF.
                    entity_ids, tags, extents, char_onsets, char_offsets = zip(*mentions)
                    mention_onsets, mention_offsets = convert_extents(
                        char_onsets, char_offsets, token_onsets, token_offsets)
                    mentions_ = list(zip(tags, mention_onsets, mention_offsets))

                    # Eliminate overlapping mentions, retaining whichever
                    # is first when sorted in ascending order by (onset, offset).
                    sort_mentions(mentions_)
                    prev_mention_offset = -1
                    temp_mentions_ = []
                    for tag, mention_onset, mention_offset in mentions_:
                        if mention_onset > prev_mention_offset:
                            temp_mentions_.append([tag, mention_onset, mention_offset])
                            prev_mention_offset = mention_offset
                    mentions_ = temp_mentions_

                feats, targets = enc.get_feats_targets(
                    tokens, mentions_, token_nums, token_As, token_Bs,
                    token_Gs, token_Fs, token_Js, A_vals, B_vals, G_vals)
            except Exception:
                logger.warn('Feature extraction failed for %s. Skipping.' % laf)
                continue

            # Write to file.
            write_crfsuite_file(f, feats, targets)
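# Hedged sketch (an assumption, not the module's actual writer): per the
# CRFsuite manual linked above, the training format is one token per line,
# the target label first followed by TAB-separated features, with a blank
# line terminating each token sequence. write_crfsuite_file is assumed to
# emit something like the following; the helper name here is hypothetical.
def _toy_write_crfsuite_file(f, feats, targets):
    """Write one token sequence in CRFsuite data format: LABEL<TAB>feat1..."""
    for token_feats, target in zip(feats, targets):
        f.write('%s\t%s\n' % (target, '\t'.join(token_feats)))
    f.write('\n')  # blank line ends the sequence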
def write_train_data(lafs, ltf_dir, enc, trainf):
    """Extract features and target labels for each LTF/LAF pair and write
    to disk in CRFsuite data format.

    For details regarding this format, consult

    http://www.chokkan.org/software/crfsuite/manual.html

    Inputs
    ------
    lafs : list of str
        Paths to LAF files.

    ltf_dir : str
        Directory to search for LTF files.

    enc : features.Encoder
        Feature encoder.

    trainf : str
        CRFsuite training file.
    """
    with open(trainf, 'w') as f:
        for laf in lafs:
            # Check that the LTF and LAF are valid.
            bn = os.path.basename(laf)
            ltf = os.path.join(ltf_dir, bn.replace('.laf.xml', '.ltf.xml'))
            laf_doc = load_doc(laf, LAFDocument, logger)
            ltf_doc = load_doc(ltf, LTFDocument, logger)
            if laf_doc is None or ltf_doc is None:
                continue

            # Extract features/targets.
            try:
                # Extract tokens.
                tokens, token_ids, token_onsets, token_offsets = ltf_doc.tokenized()

                # Convert mentions to format expected by the encoder; that is,
                # (tag, token_onset, token_offset).
                mentions = laf_doc.mentions()
                if len(mentions) == 0:
                    mentions_ = []
                else:
                    # Map to the minimal enclosing span of tokens in the
                    # supplied LTF.
                    entity_ids, tags, extents, char_onsets, char_offsets = zip(*mentions)
                    mention_onsets, mention_offsets = convert_extents(
                        char_onsets, char_offsets, token_onsets, token_offsets)
                    mentions_ = list(zip(tags, mention_onsets, mention_offsets))

                    # Eliminate overlapping mentions, retaining whichever
                    # is first when sorted in ascending order by (onset, offset).
                    sort_mentions(mentions_)
                    prev_mention_offset = -1
                    temp_mentions_ = []
                    for tag, mention_onset, mention_offset in mentions_:
                        if mention_onset > prev_mention_offset:
                            temp_mentions_.append([tag, mention_onset, mention_offset])
                            prev_mention_offset = mention_offset
                    mentions_ = temp_mentions_

                # Extract features/targets and write to file in CRFsuite
                # format.
                feats, targets = enc.get_feats_targets(tokens, mentions_)
            except KeyError:
                logger.warn('Feature extraction failed for %s. Skipping.' % laf)
                continue

            # Write to file.
            write_crfsuite_file(f, feats, targets)
def write_train_data(lafs, ltf_dir, enc, trainf):
    """Extract features and target labels for each LTF/LAF pair and write
    to disk in CRFsuite data format.

    For details regarding this format, consult

    http://www.chokkan.org/software/crfsuite/manual.html

    Inputs
    ------
    lafs : list of str
        Paths to LAF files.

    ltf_dir : str
        Directory to search for LTF files.

    enc : features.Encoder
        Feature encoder.

    trainf : str
        CRFsuite training file.
    """
    with open(trainf, 'w') as f:
        for laf in lafs:
            # Check that the LTF and LAF are valid.
            bn = os.path.basename(laf)
            ltf = os.path.join(ltf_dir, bn.replace('.laf.xml', '.ltf.xml'))
            laf_doc = load_doc(laf, LAFDocument, logger)
            ltf_doc = load_doc(ltf, LTFDocument, logger)
            if laf_doc is None or ltf_doc is None:
                continue

            # Extract features/targets.
            try:
                # Extract tokens.
                tokens, token_ids, token_onsets, token_offsets = ltf_doc.tokenized()

                # Convert mentions to format expected by the encoder; that is,
                # (tag, token_onset, token_offset).
                mentions = laf_doc.mentions()
                if len(mentions) == 0:
                    mentions_ = []
                else:
                    # Map to the minimal enclosing span of tokens in the
                    # supplied LTF.
                    entity_ids, tags, extents, char_onsets, char_offsets = zip(*mentions)
                    mention_onsets, mention_offsets = convert_extents(
                        char_onsets, char_offsets, token_onsets, token_offsets)
                    mentions_ = list(zip(tags, mention_onsets, mention_offsets))

                    # Eliminate overlapping mentions, retaining whichever
                    # is first when sorted in ascending order by (onset, offset).
                    sort_mentions(mentions_)
                    prev_mention_offset = -1
                    temp_mentions_ = []
                    for tag, mention_onset, mention_offset in mentions_:
                        if mention_onset > prev_mention_offset:
                            temp_mentions_.append([tag, mention_onset, mention_offset])
                            prev_mention_offset = mention_offset
                    mentions_ = temp_mentions_

                # Extract features/targets and write to file in CRFsuite
                # format.
                feats, targets = enc.get_feats_targets(tokens, mentions_)
            except KeyError:
                logger.warn('Feature extraction failed for %s. Skipping.' % laf)
                continue

            # Write to file.
            write_crfsuite_file(f, feats, targets)
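# Toy walk-through (hypothetical helper, not in the original module) of the
# overlap-elimination step used in write_train_data above. sort_mentions is
# approximated here by an ascending (onset, offset) sort, which is an
# assumption about the real helper's behavior.
def _demo_overlap_elimination():
    """Show that only the first of two overlapping mentions is retained."""
    mentions_ = [('PER', 3, 5), ('ORG', 0, 2), ('LOC', 4, 6)]
    mentions_.sort(key=lambda m: (m[1], m[2]))  # stand-in for sort_mentions
    kept, prev_offset = [], -1
    for tag, onset, offset in mentions_:
        if onset > prev_offset:
            kept.append((tag, onset, offset))
            prev_offset = offset
    # kept == [('ORG', 0, 2), ('PER', 3, 5)]; ('LOC', 4, 6) is dropped
    # because it overlaps the earlier ('PER', 3, 5) span.
    return kept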