def main(): """Demonstrate using the codeswitchador API.""" parser = argparse.ArgumentParser(description=main.__doc__) parser.add_argument('configfile', help='configuration file for the lidder') args = parser.parse_args() config = args.configfile # Model version, here's the latest model = MODEL1_5 # Labeling method of low confidence tokens lowmethod = LOW_METHOD_UNK # Labeling method for unknown tokens unkmethod = UNK_METHOD_LEFT lidder = default_lidder(model, config) tokens = ["Yo", "quiero", "go", "to", "the", "store", "en", "el", "centro"] result = label_tokens(tokens, lidder, model, lowmethod, unkmethod) print result
def annotate(model, inpath, outfile, show_ratio, lowmethod, unkmethod, informat, annotated,
             ignore_entities, n_folds, quiet=False):
    """Annotate tokens in the file and, if annotated, score against the gold standard.

    Runs language identification over each line of input (or replays predictions
    from a log when informat == FORMAT_LOG), writes token/language pairs to
    outfile, and accumulates message-level LID/codeswitching metrics plus
    token-by-token accuracy and codeswitch-boundary metrics, reporting them on
    stderr when the input is annotated.

    Args:
        model: model constant (MODEL0/MODEL1/MODEL1_5); must be in
            SUPPORTED_MODELS unless reading from a log.
        inpath: path to the input file (also the basis for k-fold test splits).
        outfile: writable file object for per-token labeled output.
        show_ratio: if True, print each token's ratio instead of its language.
        lowmethod: labeling method for low-confidence tokens. Overridden to
            LOW_METHOD_MLE for MODEL1 (see below).
        unkmethod: labeling method for unknown tokens. Overridden to
            UNK_METHOD_LEFT for MODEL1 (see below).
        informat: input format constant; FORMAT_LOG means no lidder is built
            and predictions come from the file itself.
        annotated: whether the input carries gold annotations to score against.
        ignore_entities: if True, exclude tokens whose gold label contains an
            entity marker from token accuracy scoring.
        n_folds: if truthy, evaluate over the k-fold test splits of inpath
            instead of the single file.
        quiet: if True, suppress per-token output (and the verbose error dump).

    Returns:
        A 7-tuple of evaluators:
        (all_lid_acc, twoway_lid_acc, cs_perf,
         nounk_all_lid_acc, nounk_twoway_lid_acc, nounk_cs_perf, token_acc).
    """
    # NOTE(review): assert is stripped under -O; consider raising ValueError for
    # this input validation instead.
    assert(informat == FORMAT_LOG or model in SUPPORTED_MODELS)
    # NOTE(review): verbose is never set to True anywhere in this function, so
    # the per-error confusion dump near the end is dead code unless edited here.
    verbose = False
    # Fire up a lidder if we're not just reading from log
    if informat != FORMAT_LOG:
        lidder = default_lidder(model, RATIOLIST_DEFAULT_CONFIG)
    # Force model1 to use MLE for the low method and UNK_METHOD to left, so we're sure no randomness
    # gets applied
    if model == MODEL1:
        lowmethod = LOW_METHOD_MLE
        unkmethod = UNK_METHOD_LEFT
    # Set the flag for whether we're evaluating only message LID, not token-by-token
    token_eval = not(informat == FORMAT_LOG or model == MODEL0)
    # Evaluators
    token_acc = Accuracy()
    # For when unk is allowed
    all_lid_acc = Accuracy()
    twoway_lid_acc = Accuracy()
    cs_perf = SDMetrics()
    # When unk is excluded
    nounk_all_lid_acc = Accuracy()
    nounk_twoway_lid_acc = Accuracy()
    nounk_cs_perf = SDMetrics()
    # Code switch points
    cs_boundaries = SDMetrics()
    # Either evaluate over every k-fold test split, or just the one input file.
    # (Train paths are unused here; only the held-out test files are scored.)
    if n_folds:
        train_paths, test_paths = kfolds(inpath, n_folds)
        infiles = [_open_infile(inpath, informat) for inpath in test_paths]
    else:
        infiles = [_open_infile(inpath, informat)]
    fold_accuracies = []
    for infile in infiles:
        # To match the SVM_HMM evaluation, we have a special token accuracy that's reset every fold
        fold_token_acc = Accuracy()
        for tokens, tags, gold_langs, lid, gold_lid in _tokens_tags_langs(infile, informat, annotated):
            # Put in dummy tags if needed
            if not tags:
                tags = [JERBOA_NOTAG] * len(tokens)
            tokens_lower = [token.lower() for token in tokens]
            # We label all tokens only if it's annotated, as the annotations will later
            # wipe out anything we shouldn't have labeled
            # TODO: This is a little wonky as labeled bad tokens can affect the lid/cs
            # decision, but for model 1.0 this doesn't actually matter as they aren't
            # in the wordlist
            if informat == FORMAT_LOG:
                # Skip lines with no gold annotation
                if not gold_lid:
                    continue
                # Don't label anything, just use what we got from the log file
                verdict = lid == MULTIPLE_LANGS
            elif model == MODEL0:
                # Model 0 gives only a message-level call; no per-token output.
                lid, langspresent, hits, verdict = lidder.idlangs(tokens_lower)
                ratios = out_langs = unk_rate = None
            else:
                # MODEL1_5 takes the low/unk methods and POS tags; older models
                # take only the tokens.
                lid, langspresent, hits, ratios, out_langs, unk_rate, verdict = \
                    (lidder.idlangs(tokens_lower, lowmethod, unkmethod, tags)
                     if model == MODEL1_5 else lidder.idlangs(tokens_lower))
            # verdict True means the message was judged codeswitched.
            output_lang = lid if not verdict else MULTIPLE_LANGS
            # Token labeling
            if token_eval:
                # For model 1.0, apply MLE
                if model == MODEL1:
                    out_langs = [choose_lang(token, lang, lidder.langs, tag, ratio, lowmethod,
                                             unkmethod, False)
                                 for token, tag, lang, ratio in
                                 zip(tokens_lower, tags, out_langs, ratios)]
                # Carry over NO_LANG labels from the gold standard
                if gold_langs:
                    out_langs = [out_lang if gold_lang != NO_LANG else NO_LANG
                                 for out_lang, gold_lang in zip(out_langs, gold_langs)]
                # Truncate to one char
                out_langs = [lang[0] if lang else UNKNOWN_LANG[0] for lang in out_langs]
                # Output tokens
                if not quiet:
                    out_tokens = ([(token, "{0:1.3f}".format(ratio))
                                   for token, ratio in zip(tokens, ratios)]
                                  if show_ratio else zip(tokens, out_langs))
                    print >> outfile, " ".join(["/".join(token_pair) for token_pair in out_tokens])
            # If it isn't annotated, skip over scoring and go to the next tokens
            if not annotated:
                continue
            # Scoring!
            # First, tokens
            if token_eval:
                # Individual tokens
                for pred_lang, gold_lang, token in zip(out_langs, gold_langs, tokens_lower):
                    # Clean gold_lang of entities
                    gold_lang_clean = clean_entities(gold_lang)
                    # Skip non-linguistic gold labels ('o' and NO_LANG), tokens we
                    # declined to label, and (optionally) entity-bearing tokens.
                    if (gold_lang_clean not in (NO_LANG, 'o') and pred_lang != NO_LANG and
                            (not ignore_entities or not contains_entity(gold_lang))):
                        token_acc.score(pred_lang, gold_lang_clean, token.lower())
                        fold_token_acc.score(pred_lang, gold_lang_clean, token.lower())
                # Codeswitch points
                last_pred_lang = None
                last_gold_lang = None
                last_token = None
                for pred_lang, gold_lang, token in zip(out_langs, gold_langs, tokens_lower):
                    # Skip non-linguistic tokens
                    if gold_lang not in VALID_CS_LANGS:
                        continue
                    # Score if we have a valid last token
                    if last_gold_lang is not None:
                        # True label is whenever the language changes, but don't predict codeswitching
                        # if one of the langs was unknown. Since the label's been truncated, we take
                        # the first char of UNKNOWN_LANG.
                        pred_cs = (pred_lang != UNKNOWN_LANG[0] and
                                   last_pred_lang != UNKNOWN_LANG[0] and
                                   pred_lang != last_pred_lang)
                        gold_cs = gold_lang != last_gold_lang
                        cs_boundaries.score(pred_cs, gold_cs, (last_token, token))
                    # Update last langs/token
                    last_pred_lang = pred_lang
                    last_gold_lang = gold_lang
                    last_token = token
            # Next, messages
            # Compute a gold_lid if we don't know it already
            if not gold_lid:
                gold_valid_langs = _valid_langs_set(gold_langs)
                gold_lid = (list(gold_valid_langs)[0] if len(gold_valid_langs) == 1
                            else MULTIPLE_LANGS)
            if gold_lid != MULTIPLE_LANGS:
                # One lang means we should check lid accuracy
                twoway_lid_acc.score(output_lang, gold_lid)
                cs_perf.score(verdict, False)
            else:
                # Multiple langs means we should check for codeswitching
                cs_perf.score(verdict, True)
            # Always record all-way LID
            all_lid_acc.score(output_lang, gold_lid)
            # Repeat not unk
            if gold_lid != UNKNOWN_LANG:
                if gold_lid != MULTIPLE_LANGS:
                    # One lang means we should check lid accuracy
                    nounk_twoway_lid_acc.score(output_lang, gold_lid)
                    nounk_cs_perf.score(verdict, False)
                else:
                    # Multiple langs means we should check for codeswitching
                    nounk_cs_perf.score(verdict, True)
                # Always record all-way LID
                nounk_all_lid_acc.score(output_lang, gold_lid)
        # Track fold accuracy
        fold_accuracies.append(fold_token_acc.accuracy)
    # Report everything to stderr when we had gold annotations to score against.
    if annotated:
        output = sys.stderr
        print >> output, '*' * 10 + "All data evaluation" + '*' * 10
        print >> output, "All message LID:"
        print >> output, all_lid_acc
        print >> output, all_lid_acc.confusion_matrix()
        print >> output
        print >> output, "Non-codeswitched message LID:"
        print >> output, twoway_lid_acc
        print >> output, twoway_lid_acc.confusion_matrix()
        print >> output
        print >> output, "Message CS:"
        print >> output, cs_perf
        print >> output, cs_perf.confusion_matrix()
        print >> output
        print >> output, '*' * 10 + "No unknown lang data evaluation" + '*' * 10
        print >> output, "All message LID:"
        print >> output, nounk_all_lid_acc
        print >> output, nounk_all_lid_acc.confusion_matrix()
        print >> output
        print >> output, "Non-codeswitched message LID:"
        print >> output, nounk_twoway_lid_acc
        print >> output, nounk_twoway_lid_acc.confusion_matrix()
        print >> output
        print >> output, "Message CS:"
        print >> output, nounk_cs_perf
        print >> output, nounk_cs_perf.confusion_matrix()
        print >> output
        if token_eval:
            print >> output, '*' * 10 + "Token by token evaluation" + '*' * 10
            print >> output, "Token-by-token LID:"
            print >> output, "Low method:", lowmethod
            if model != MODEL1:
                # Model 1 doesn't actually do unk attachment
                print >> output, "Unk method:", unkmethod
            print >> output, token_acc
            print >> output, token_acc.confusion_matrix()
            print >> output
            print >> output, "Codeswitching boundaries:"
            print >> output, cs_boundaries
            print >> output, cs_boundaries.confusion_matrix()
            print >> output
            # Dump every individual confusion (gold/pred mismatch) token.
            # Dead in practice: verbose is hard-coded False above.
            if not quiet and verbose:
                for gold, subdict in token_acc.confusion.items():
                    for pred, errors in subdict.items():
                        if gold == pred or not errors:
                            continue
                        print >> output, '*' * 40
                        print >> output, "Gold:", gold, "Pred:", pred
                        for error in sorted(set(errors)):
                            print >> output, error
                        print >> output
        # Report average fold accuracy if needed
        if len(fold_accuracies) > 1:
            mean_accuracy = sum(fold_accuracies) / len(fold_accuracies)
            print >> output, "Fold token accuracies: " + ", ".join("%.4f" % acc
                                                                   for acc in fold_accuracies)
            print >> output, "Mean token accuracy across folds: %.4f" % mean_accuracy
    return (all_lid_acc, twoway_lid_acc, cs_perf, nounk_all_lid_acc, nounk_twoway_lid_acc,
            nounk_cs_perf, token_acc)