def output_trial_data(lexicon, wordpath, outpath, irregulars, ag_items):
    """Augment trial data with information about each word.

    Reads the CSV at ``wordpath`` (columns used: ``Word``, ``NSyll``,
    ``NPhon``, ``OLD``, ``PLD``) and writes one row per usable word to the
    CSV at ``outpath``, with the columns listed in ``word_fields``.

    Args:
        lexicon: Mapping from word text to word objects.  It must also
            provide ``inflect_sets`` and the ``base_freq_*``,
            ``suffix_freq_*``, ``p_form_base_*``, ``word_rank``,
            ``base_rank`` and ``freq_greater_root_sbtlx`` lookups used below.
        wordpath: Path of the input CSV.
        outpath: Path of the output CSV (overwritten).
        irregulars: Mapping from word text to objects with a ``compound``
            attribute.
        ag_items: Mapping from word text to objects with ``kf_cluster_freq``
            and ``experiment`` attributes.

    Side effects:
        Prints a summary of excluded and written rows to stdout.  Exits the
        process with status 1 (after a message on stderr) if either file
        cannot be opened.
    """
    word_fields = [
        'word', 'analysis', 'root', 'n.suffixes', 'suffix', 'n.syll',
        'n.phon', 'OLD', 'PLD',
        'sbtlx.freq', 'hal.freq', 'celex.freq', 'kf.freq',
        'sbtlx.basefreq', 'hal.basefreq', 'celex.basefreq', 'kf.basefreq',
        'sbtlx.suffixfreq', 'hal.suffixfreq', 'celex.suffixfreq',
        'kf.suffixfreq',
        'sbtlx.pformbase', 'hal.pformbase', 'celex.pformbase',
        'kf.pformbase',
        'sbtlx.wordrank', 'sbtlx.baserank', 'sbtlx.freqgreaterroot',
        'ag.kf.clusterfreq', 'length', 'proper', 'bare', 'bimorph',
        'inflectional', 'derivational', 'irreg', 'in.ag', 'ag.exp',
        'inflect.family', 'exclude',
    ]

    # Open input
    try:
        infile = open(wordpath, 'rU')
    except IOError:
        # Report the path that actually failed (the input path).
        sys.stderr.write("Couldn't open input file at %s\n" % wordpath)
        sys.exit(1)

    # Open output
    try:
        outfile = open(outpath, 'wb')
    except IOError:
        infile.close()
        sys.stderr.write("Couldn't open output file at %s\n" % outpath)
        sys.exit(1)

    nrows = 0
    nexclusions = 0
    # Both files are closed on exit from this block, even if a row fails.
    with infile, outfile:
        reader = csv.DictReader(infile)
        writer = csv.DictWriter(outfile, word_fields)

        # Write out header and data
        writer.writeheader()
        for row in reader:
            # A missing 'Word' column and a word absent from the lexicon both
            # raise KeyError here and are both counted as exclusions.
            try:
                word = lexicon[row['Word']]
            except KeyError:
                nexclusions += 1
                continue

            # Skip words that didn't appear in the original source
            if word.fake:
                continue

            proper = word.text[0].isupper()
            # Written as an integer in both cases; the CSV text is unchanged.
            nsuffixes = len(word.suffixes) if word.suffixes else 0
            # The literal "null" stands in for "no suffix" and is also what
            # gets passed to the suffix frequency lookups below.
            suffix = word.suffixes[-1] if word.suffixes else "null"
            bare = not word.suffixes and not word.prefixes
            bimorph = bool(word.suffixes and not word.prefixes and
                           len(word.suffixes) == 1)
            irreg = (word.text in irregulars and
                     not irregulars[word.text].compound)

            in_ag = word.text in ag_items
            ag_item = ag_items[word.text] if in_ag else None
            ag_kf_clusterfreq = ag_item.kf_cluster_freq if in_ag else None
            ag_exp = ag_item.experiment if in_ag else None

            # Whether it is in an inflectional family of size > 1
            inflect_family = (
                (word.root in lexicon.inflect_sets) and
                (word.text in lexicon.inflect_sets[word.root]) and
                (len(lexicon.inflect_sets[word.root]) > 1))

            # Whether the frequency is greater than the root, which can be
            # T/F or NA
            sbtlx_freq_greater_root = lexicon.freq_greater_root_sbtlx(word)

            # Whether we just don't like this item
            exclude = exclude_item(word.text)

            # Create the row
            out_row = {
                'word': word.text,
                'analysis': word.analysis,
                'root': na_none(word.root),
                'n.suffixes': nsuffixes,
                'suffix': na_none(suffix),
                'n.syll': na_null(row['NSyll']),
                'n.phon': na_null(row['NPhon']),
                'length': word.length,
                'OLD': na_null(row['OLD']),
                'PLD': na_null(row['PLD']),
                'sbtlx.freq': word.freq_sbtlx,
                'hal.freq': word.freq_hal,
                'celex.freq': word.freq_celex,
                'kf.freq': word.freq_kf,
                'sbtlx.basefreq': na_none(lexicon.base_freq_sbtlx(word.root)),
                'hal.basefreq': na_none(lexicon.base_freq_hal(word.root)),
                'celex.basefreq': na_none(lexicon.base_freq_celex(word.root)),
                'kf.basefreq': na_none(lexicon.base_freq_kf(word.root)),
                'sbtlx.suffixfreq':
                    na_none(lexicon.suffix_freq_sbtlx(suffix)),
                'hal.suffixfreq': na_none(lexicon.suffix_freq_hal(suffix)),
                'celex.suffixfreq':
                    na_none(lexicon.suffix_freq_celex(suffix)),
                'kf.suffixfreq': na_none(lexicon.suffix_freq_kf(suffix)),
                'sbtlx.pformbase': na_none(lexicon.p_form_base_sbtlx(word)),
                'hal.pformbase': na_none(lexicon.p_form_base_hal(word)),
                'celex.pformbase': na_none(lexicon.p_form_base_celex(word)),
                'kf.pformbase': na_none(lexicon.p_form_base_kf(word)),
                'sbtlx.wordrank': na_none(lexicon.word_rank(word)),
                'sbtlx.baserank': na_none(lexicon.base_rank(word.root)),
                'sbtlx.freqgreaterroot':
                    convert_r_bool(sbtlx_freq_greater_root),
                'ag.kf.clusterfreq': na_none(ag_kf_clusterfreq),
                'proper': convert_r_bool(proper),
                'bare': convert_r_bool(bare),
                'bimorph': convert_r_bool(bimorph),
                'inflectional': convert_r_bool(word.inflectional),
                'derivational': convert_r_bool(word.derivational),
                'irreg': convert_r_bool(irreg),
                'in.ag': convert_r_bool(in_ag),
                'ag.exp': na_none(ag_exp),
                'inflect.family': convert_r_bool(inflect_family),
                'exclude': convert_r_bool(exclude),
            }
            writer.writerow(out_row)
            nrows += 1

    print("Excluded %d items." % nexclusions)
    print("Wrote %d rows to %s." % (nrows, outpath))
def main():
    """Output data on irregular forms."""
    # NOTE: the docstring above is passed to argparse as the program
    # description, so editing it changes the --help output.
    #
    # Overview: load the SUBTLEX norms and the irregular-verb table, compute a
    # frequency for every irregular plus three rankings (by rule count, by
    # word frequency, and by (rule count, word frequency)), then write one CSV
    # row per irregular to the output path.  Exits with status 1 if the output
    # file cannot be opened.
    parser = argparse.ArgumentParser(description=main.__doc__)
    parser.add_argument('irregulardata',
                        help='CSV file containing irregular verb information')
    parser.add_argument('subtlexpath', help='SUBTLEX frequency norms')
    parser.add_argument('outputpath', help='output file')
    args = parser.parse_args()
    subtlexpath = args.subtlexpath
    irregpath = args.irregulardata
    outpath = args.outputpath

    print("Loading SUBTLEX frequency data...")
    subtlex = SubtlexDict(subtlexpath)
    print("Loaded frequency information for %d words from %s." %
          (len(subtlex), repr(subtlexpath)))

    print("Loading irregulars...")
    irregulars = parse_irregulars(irregpath)
    print("Read %d irregular verbs from %s." %
          (len(irregulars), repr(irregpath)))

    # Get canonical frequencies: try the word as given, then its capitalized
    # form, and fall back to 0 when neither is in SUBTLEX.
    freqs = {}
    for word in irregulars:
        try:
            freqs[word] = subtlex[word].freq_count_low
        except KeyError:
            try:
                freqs[word] = subtlex[word.capitalize()].freq_count_low
            except KeyError:
                freqs[word] = 0

    # Compute irregular counts; ranks are 1-based, highest count/frequency
    # first.
    irregular_rule_counts = count_irregular_rules(irregulars, True, subtlex)
    irregular_rule_ranks = {
        rule: rank for rank, rule in enumerate(
            sorted(irregular_rule_counts, key=irregular_rule_counts.get,
                   reverse=True), 1)}
    irregular_freq_ranks = {
        word: rank for rank, word in enumerate(
            sorted(irregulars, key=freqs.get, reverse=True), 1)}

    # Compute total ranking by class order: sort on the pair
    # (rule count of the word's class, word frequency).
    # NOTE(review): this dict is keyed by each entry's ``.word`` but looked up
    # below by the keys of ``irregulars`` -- presumably identical; confirm.
    irregular_class_freqs = {
        entry.word: (irregular_rule_counts[entry.class_key],
                     freqs[entry.word])
        for entry in irregulars.itervalues()}
    # NOTE(review): looks like leftover debugging output; kept so stdout is
    # unchanged.
    print(sorted(irregular_class_freqs.items()))
    irregular_classfreq_ranks = {
        word: rank for rank, word in enumerate(
            sorted(irregulars, key=irregular_class_freqs.get, reverse=True),
            1)}

    # Open output
    fields = ['word', 'pastsuffix', 'pastexclude', 'pastrulecount',
              'pastrulerank', 'stemchange', 'sbtlx.freq', 'irregfreqrank',
              'irregclassfreqrank']
    try:
        outfile = open(outpath, 'wb')
    except IOError:
        sys.stderr.write("Couldn't open output file at %s\n" % outpath)
        sys.exit(1)

    # Output data; the file is closed even if writing a row fails.
    nrows = 0
    nexclusions = 0
    with outfile:
        writer = csv.DictWriter(outfile, fields)
        writer.writeheader()
        for word in irregulars:
            entry = irregulars[word]
            past_exclude = entry.exclude
            writer.writerow({
                'word': word,
                'pastsuffix': na_none(entry.suffix),
                'pastexclude': convert_r_bool(past_exclude),
                'pastrulecount':
                    na_none(irregular_rule_counts[entry.class_key]),
                'pastrulerank':
                    na_none(irregular_rule_ranks[entry.class_key]),
                'stemchange': na_none(entry.stem_change_nodevoice),
                'sbtlx.freq': freqs[word],
                'irregfreqrank': irregular_freq_ranks[word],
                'irregclassfreqrank': irregular_classfreq_ranks[word],
            })

            # Count rows and exclusions
            nrows += 1
            if past_exclude:
                nexclusions += 1

    print("Marked %d items as excluded." % nexclusions)
    print("Wrote %d rows to %s." % (nrows, outpath))