def TestRuleSystem (lts_learner, lts_rules, verbose=False):
    # Note: prog_args and threshold_value are assumed to be module-level
    # globals set by the mainline option handling.
    all_results = []
    for alignment_pathname in prog_args:
        if os.path.exists (alignment_pathname):
            t1 = time.time()
            lts_measurer = Measurer()
            lts_measurer.InitWithAlignments (alignment_pathname)
            if verbose:
                print 'Reading alignments from %s in %3.2f s' %(alignment_pathname, time.time()-t1)
            all_results.append (lts_measurer.MeasurePredictionAccuracy (lts_learner, lts_rules, verbose=True))

    # Initialize the totals so the return value below is well-defined even
    # when no alignment files were found.
    total_words_correct = total_words_tested = 0
    total_chars_correct = total_chars_tested = 0
    if len (all_results) > 0:
        total_words_correct = sum (column (column (all_results)))
        total_words_tested  = sum (column (column (all_results), 1))
        total_chars_correct = sum (column (column (all_results,1)))
        total_chars_tested  = sum (column (column (all_results,1), 1))
        word_percent_correct = 100.0 * total_words_correct / max (1, total_words_tested)
        char_percent_correct = 100.0 * total_chars_correct / max (1, total_chars_tested)
        if verbose:
            print ' Threshold %i' %(threshold_value)
            print ' Total words correct: %6.3f %8i %8i' %(word_percent_correct, total_words_correct, total_words_tested)
            print ' Total chars correct: %6.3f %8i %8i' %(char_percent_correct, total_chars_correct, total_chars_tested)
            print '--------------------\n'
    return (total_words_correct, total_words_tested), (total_chars_correct, total_chars_tested)
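def _TestRuleSystemAggregationSketch ():
    # Illustrative only (never called): how the nested column() calls in
    # TestRuleSystem reduce the per-file results. Column.column is used
    # throughout this code as if it were [row[i] for row in rows] with i
    # defaulting to 0; _column below is a stand-in with that assumed behavior.
    def _column (rows, i=0):
        return [row[i] for row in rows]
    # two fake per-file results: ((words_correct, words_tested), (chars_correct, chars_tested))
    results = [((8, 10), (40, 50)), ((5, 10), (30, 50))]
    assert sum (_column (_column (results)))       == 13    # total words correct
    assert sum (_column (_column (results), 1))    == 20    # total words tested
    assert sum (_column (_column (results, 1)))    == 70    # total chars correct
    assert sum (_column (_column (results, 1), 1)) == 100   # total chars tested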
def WriteOutLetterToSoundSolutions (outfile, word_counter, charseq, phoneseq, solutions, include_failures=False):
    word = string.join (charseq, '')
    pron = string.join (phoneseq, ' ')
    if solutions:
        for cnt, (score, soln) in enumerate (solutions):
            total_score = reduce ((lambda x,y: x+y), column(soln,2))
            outfile.write (' %5i. Soln %i %5.0f %20s : %-40s |' \
                           %(word_counter, cnt+1, total_score, word, pron))
            for (source, target, score) in soln:
                num_chars = 1 + int (math.log (max(1.0,score),10.0))
                spacer = ' ' * max (0, (5 - num_chars))
                if len(target) == 0:
                    target = '_'
                outfile.write (' %2s [%i] %s -> %-6s' \
                               %(string.join(source,''), score, spacer, string.join(target)))
            outfile.write('\n')
    elif include_failures:
        outfile.write (' %5i. FAILED %5s %20s : %-40s |' %(word_counter, '', word, pron))
        outfile.write (' %2s -> %-5s\n' %(word, ''))
    pass
def ComputeAccuracyVsRulesCurve (lts_learner, max_threshold = 9999999999):
    rule_counts_list = list (set (column (lts_learner.GetRulesByCount())))
    rule_counts_list.sort()

    # Dump a festival-format rule file at each threshold value.
    print '%6s %8s %8s' %('Thresh', 'Rules', 'Chars')
    for thr in rule_counts_list:
        lts_rules, num_lts_rules, num_lts_chars = lts_learner.ThresholdRules (thr)
        filename = 'rules/lexrules_%s.scm' %(string.zfill(num_lts_rules,5))
        print '%6i %8i %8i' %(thr, num_lts_rules, num_lts_chars)
        lts_learner.WriteOutFestivalRules (filename, 'cmu_us', lts_rules)
    #sys.exit()   # debug early-exit; left commented out so the accuracy sweep below is reachable

    #new_lts_rules, num_lts_rules, num_lts_chars = lts_learner.ThresholdRules (3)
    #lts_learner.WriteOutFestivalRules ('afile', new_lts_rules)

    # Measure word and char accuracy at each threshold value.
    print '%4s %6s %6s %8s %8s' %('Num', 'Thresh', 'Rules', 'Words', 'Chars')
    for i, threshold in enumerate (rule_counts_list):
        if threshold > max_threshold: continue
        lts_rules, num_lts_rules, num_lts_chars = lts_learner.ThresholdRules (threshold)
        word_perf, char_perf = TestRuleSystem (lts_learner, lts_rules)
        word_percent_correct = 100.0 * word_perf[0] / max (1, word_perf[1])
        char_percent_correct = 100.0 * char_perf[0] / max (1, char_perf[1])
        print '%4i %6i %6i %8.3f %8.3f ' \
              %(i+1, threshold, num_lts_rules, word_percent_correct, char_percent_correct), \
              word_perf, char_perf
    print
def LaunchServer():
    # Delete the launch notification file if it happens to be hanging around
    # from a previous run (it shouldn't be there, but could be if the server
    # was killed).
    if os.path.exists (Launch_Notice_Filename):
        os.remove (Launch_Notice_Filename)

    # Define the flag list and get the command line options.
    # Note: either --festdict or --lexlist is required.
    flag_list = [('help',        'this command'),
                 ('port=',       'IP port number for communication with xml server (default 8000)'),
                 ('workdir=',    'specifies the working directory to use (defaults to current)'),
                 ('phoneset=',   'filename of list of phonemes'),
                 ('prompts=',    'filename of festival-format prompt list'),
                 ('festdict=',   'filename of festival-format pronunciation dictionary'),
                 ('lexlist=',    'filename of word list with word frequency counts'),
                 ('ignore=',     'filename of festival-format list of words to ignore (e.g. those lexlearned previously)'),
                 ('allowables=', 'filename of (partial) list of LTS allowables (optional)'),
                 ('newpronuns=', 'filename of existing list of word pronunciations (optional)'),
                 ('rules=',      'filename of a pickled rule set that has previously been learned')]
    try:
        opt_list, prog_args = getopt.getopt (sys.argv[1:], '', column(flag_list))
        option_tbl = dict (opt_list)
    except getopt.GetoptError, msg:
        print 'Error', msg
        sys.exit(' type --help for program options\n')
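    # Illustrative launch command (the script name and argument values here
    # are made up; the flag names come from flag_list above, and either
    # --festdict or --lexlist must be supplied):
    #
    #   python lts_server.py --port=8000 --workdir=work \
    #          --phoneset=phones.txt --festdict=lexicon.scm --prompts=prompts.data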
def PredictMultipleWordPronuns (self, given_charseq, verbose=False):
    # ----------------------------------------------------------------------------------------------
    def FindAllSolutions (prediction):
        def DescendOneLevel (curr_solution, level):
            if level >= len (prediction):
                all_solutions.append (curr_solution[:])
                curr_solution.pop()
                return
            letter, best_phoneseq, all_phoneseqs = prediction [level]
            for one_phoneseq, score, rule_num in all_phoneseqs:
                curr_solution.append ((one_phoneseq, score))
                DescendOneLevel (curr_solution, level+1)
            if curr_solution:
                curr_solution.pop()
            pass

        pronun_list = []
        all_solutions = []
        DescendOneLevel ([], 0)
        for one_soln in all_solutions:
            rhs_seq = column (one_soln, 0)
            phoneseq = []
            for rhs in rhs_seq:
                if rhs:
                    phoneseq.append (string.join(rhs))
            score = sum (column (one_soln, 1))
            pronun_list.append ((score, string.join(phoneseq)))
        # sort words by score in _ascending_ order because lower counts
        # represent more specialized conditions identified by letter context.
        pronun_list.sort (reverse=False)
        answer = [(pronun, count) for (count, pronun) in pronun_list]
        return answer
    pass

    pred_phoneseq, prediction = self.PredictOneWordPronun (given_charseq)
    all_solutions = FindAllSolutions (prediction)
    if verbose and len (all_solutions) > 0:
        for soln in all_solutions:
            print 'Word soln:', soln
        print
    # do this because FindAllSolutions doesn't get pronunciations
    # when some of the letters go to '?'
    answer = [string.join(pred_phoneseq)] + column (all_solutions[1:])
    return answer
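# FindAllSolutions above enumerates the cartesian product of the per-letter
# alternatives by depth-first descent with backtracking. An equivalent sketch
# of that enumeration on made-up data, kept as a comment:
#
#   import itertools
#   prediction = [[(('k',), 10), (('s',), 3)],   # (phoneseq, score) alternatives for letter 1
#                 [(('ae',), 7)]]                # alternatives for letter 2
#   for combo in itertools.product (*prediction):
#       score  = sum ([s for (rhs, s) in combo])
#       phones = string.join ([string.join(rhs) for (rhs, s) in combo if rhs])
#       print score, phones                      # -> 17 k ae / 10 s ae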
def WeightedAve (given_list):
    # Average a list of (count, value) pairs, weighting each value by its
    # share of the total count.
    answer = 0.0
    total_count = sum (column (given_list))
    for item_info in given_list:
        item_count = item_info[0]
        item_value = item_info[1]
        item_weight = float(item_count) / float(total_count)
        answer += item_weight * item_value
    return answer
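def _WeightedAveExample ():
    # Illustrative only (never called): values 10.0, 20.0, 30.0 observed 1, 3,
    # and 6 times respectively average to (1*10 + 3*20 + 6*30) / 10 = 25.0.
    assert abs (WeightedAve ([(1, 10.0), (3, 20.0), (6, 30.0)]) - 25.0) < 1e-9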
    alignment_file_pathname = option_tbl.get ('--align','')
    #for alignment_pathname in prog_args:
    for alignment_pathname in [alignment_file_pathname]:
        if os.path.exists (alignment_pathname):
            t1 = time.time()
            lts_measurer = Measurer()
            lts_measurer.InitWithAlignments (alignment_pathname)
            print 'Reading alignments from %s in %3.2f s\n' %(alignment_pathname, time.time()-t1)
            #all_results.append (TestRuleLearner (lts_learner, lts_measurer))
            lts_measurer.WriteWagonTrainingData ('wagon.data')

    # Note: all_results is never filled in above (the TestRuleLearner call is
    # commented out), so this summary block is currently skipped.
    if len (all_results) > 1:
        total_words_correct = sum (column (column (all_results)))
        total_words_tested  = sum (column (column (all_results), 1))
        total_chars_correct = sum (column (column (all_results,1)))
        total_chars_tested  = sum (column (column (all_results,1), 1))
        word_percent_correct = 100.0 * total_words_correct / max (1, total_words_tested)
        char_percent_correct = 100.0 * total_chars_correct / max (1, total_chars_tested)
        print ' Total words correct: %6.3f %8i %8i' %(word_percent_correct, total_words_correct, total_words_tested)
        print ' Total chars correct: %6.3f %8i %8i' %(char_percent_correct, total_chars_correct, total_chars_tested)
        print '--------------------\n'

# --measure_curve
elif option_tbl.has_key ('--measure_curve') and option_tbl.has_key ('--rules'):
    ComputeAccuracyVsRulesCurve (lts_learner)
import getopt
import sys

from Column import column

# ==============
# Mainline code.
# ==============
if __name__ == "__main__":
    flag_list = [
        ("help",       "this command"),
        ("festdict=",  "filename of festival-format pronunciation dictionary"),
        ("janusdict=", "filename of janus-format pronunciation dictionary"),
    ]
    try:
        opt_list, prog_args = getopt.getopt(sys.argv[1:], "", column(flag_list))
        option_tbl = dict(opt_list)
    except getopt.GetoptError, msg:
        print "Error", msg
        sys.exit(" type --help for program options\n")

    # Print out the options if requested.
    if option_tbl.has_key("--help") or len(sys.argv) == 1:
        print "python", sys.argv[0]
        for option_flag, description in flag_list:
            print "%12s %s" % (option_flag, description)
        print
        sys.exit()

    # Read in the festival format dictionary and convert it to janus format.
uncovered_words = {}
for prompt_name, prompt in sorted(recorded_prompt_tbl.items()):
    word_list = prompt.split()
    total_prompt_tokens += len(word_list)
    for raw_word in word_list:
        word = DictionaryIO.TrimExternalPunctuation(raw_word).lower()
        total_prompt_words.add(word)
        if word in words_with_pronuns:
            prompt_tokens_covered += 1
        else:
            uncovered_words[word] = uncovered_words.get(word, 0) + 1

total_token_count = sum(column(word_freq_list, 2))
covered_prompt_words = len(total_prompt_words) - len(uncovered_words)
corpus_word_percent = 100.0 * len(word_pronun_counts) / float(max(1, len(word_freq_list)))
corpus_token_percent = 100.0 * sum(word_pronun_counts.values()) / float(max(1, total_token_count))
prompt_token_percent = 100.0 * prompt_tokens_covered / float(max(1, total_prompt_tokens))
prompt_word_percent = 100.0 * covered_prompt_words / float(max(1, len(total_prompt_words)))

print " corpus word coverage:  %6i / %-6i (%3.2f)" % (
    len(word_pronun_counts),
    len(word_freq_list),
    corpus_word_percent,
)
print " corpus token coverage: %6i / %-6i (%3.2f)" % (
    sum(word_pronun_counts.values()),
    total_token_count,
    corpus_token_percent,
)
        print 'Warning: %s is not a utf-8 file, trying again as iso-8859' %(dict_pathname)
        infile.close()
        infile = codecs.open (dict_pathname, 'r', 'iso8859')
        ReadFile (infile)
        infile.close()

    # convert the word count table to a list
    word_freq_list = []
    for word in sorted (word_freq_table):
        word_freq_list.append ([word, '', word_freq_table[word]])

    # then either extract only the words or sort the contents by frequency
    if words_only:
        return column (word_freq_list, 0)
    elif sorted_by_count:
        sorted_by_count = sorted ([(v, k) for k, v in word_freq_table.items()], reverse=True)
        sorted_by_count = [[v, '', k] for k, v in sorted_by_count]
        return sorted_by_count
    else:
        return word_freq_list
    pass

# --------------------------------------------------------------------------------------------------
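def _SortedByCountSketch ():
    # Illustrative only (never called): the sorted_by_count reshaping above,
    # with made-up counts. Note that the (k, v) names are swapped when
    # unpacking the sorted (count, word) pairs, but the rows still come out in
    # the intended [word, '', count] form.
    tbl = {'the': 9, 'cat': 2}
    pairs = sorted ([(v, k) for k, v in tbl.items()], reverse=True)   # [(9, 'the'), (2, 'cat')]
    assert [[v, '', k] for k, v in pairs] == [['the', '', 9], ['cat', '', 2]]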
def ReadFestivalDictionary (dict_pathname, words_as_charseq = False, words_only = False,
                            reverse_direction = False, character_encoding = 'auto'):
    # --------------------------------------------------------------------------
    def DetermineEncoding():
        infile = file (dict_pathname, 'r')
        for line in infile:
            for ch in line:
                if ch not in string.printable:
                    infile.close()
                    return 'utf-8'
        infile.close()
        return 'ascii'

    if character_encoding == 'auto':
        character_encoding = DetermineEncoding()
    if character_encoding != 'ascii':
        dec = codecs.getdecoder (character_encoding)
        enc = codecs.getencoder (character_encoding)

    word_pronun_list = []
    word_pronun_table = {}
    phone_number_tbl = {}
    # map parentheses and double quotes to spaces (the from and to strings
    # must be the same length for string.maketrans)
    char_mapping = string.maketrans ('()"', '   ')

    infile = file (dict_pathname, 'r')
    #infile = codecs.open (dict_pathname, 'r', 'utf-8')
    for cnt, rawline in enumerate (infile.readlines()):
        line = string.strip (rawline.strip()[1:-1])
        if not line: continue
        num_left_parens = line.count('(')
        pronun_part = ''

        # Case 1. the word is given as a string
        if num_left_parens == 1:
            quote_pos = string.rfind (line, '"') + 2
            word_part = string.translate (line[:quote_pos], char_mapping)
            rest_part = string.translate (line[quote_pos:], char_mapping)
            fields = string.split (rest_part, maxsplit=1)
            #print cnt+1, len(fields), quote_pos, character_encoding, rest_part
            if len(fields) >= 2:
                if character_encoding == 'ascii':
                    word_letters = list (string.strip(word_part))
                else:
                    try:
                        word_unicode = dec (string.strip(word_part))
                        word_letters = list (word_unicode[0])
                        assert len(word_letters) == enc(word_unicode[0])[1]
                    except:
                        #print 'SKIPPING', cnt+1, line
                        continue
                annotation = fields[0]
                pronun_part = fields[1]

        # Case 2. the word is given as a character sequence
        elif num_left_parens == 2:
            pos1 = line.find ('(')
            pos2 = line.find (')')
            pos3 = line.rfind ('(')
            pos4 = line.rfind (')')
            letter_part = line[pos1:pos2].translate (char_mapping)
            pronun_part = line[pos3:pos4].translate (char_mapping)
            annotation = line[pos2-1:pos3-1].strip()
            word_letters = letter_part.split()

        if not pronun_part: continue

        # check for duplicates before adding words to the pronunciation list
        word_string = string.join (word_letters,'')
        word_phones = tuple (pronun_part.split())
        word_letters = tuple (word_letters)
        if words_as_charseq:
            word = word_letters
        else:
            word = word_string

        # This adds a default word count of 1, and allows only unique words
        if not word_pronun_table.has_key(word):
            if not reverse_direction:
                word_pronun_list.append ([word, word_phones, 1])
            else:
                word_pronun_list.append ([word_phones, word, 1])
            word_pronun_table [word] = True
        """
        if not reverse_direction:
            word_pronun_list.append ([word_letters, word_phones, annotation])
        else:
            word_pronun_list.append ([word_phones, word_letters, annotation])
        """
    infile.close()

    if words_only:
        return column (word_pronun_list, 0)
    else:
        return word_pronun_list
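# Illustrative entries in the festival lexicon format accepted above (the
# words and phones are made up):
#
#   ("hello" nil (hh ax l ow))          ; case 1: word given as a quoted string
#   ((h e l l o) nil (hh ax l ow))      ; case 2: word given as a character sequence
#
# With the default arguments, each new word becomes [word, phone_tuple, 1],
# e.g. ['hello', ('hh', 'ax', 'l', 'ow'), 1].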
def GetFullWordsOnlyList (self):
    return column (self.word_pronuns_list)
def PrintCondensedList (sorted_production_list):
    # Note: outfile, phone_name_converter, and utf come from the enclosing scope.
    # create table that maps lhs -> all rhs productions, with counts
    lts_productions = {}
    for cnt, (usage_count, production) in enumerate (sorted_production_list):
        lhs = production[0]
        rhs = production[1]
        lts_productions [lhs] = lts_productions.get(lhs,[]) + [(rhs, usage_count)]

    # write out the results alphabetically
    outfile.write ('LTS Productions organized by letter:\n')
    lhs_keys = lts_productions.keys()
    lhs_keys.sort (cmp = LatinCharacterSet.OrderLetters)

    # First pass: gather counts, perplexities, and display widths.
    production_info = []
    for i, lhs in enumerate (lhs_keys):
        rhs_productions = lts_productions[lhs]
        rhs_prod_counts = column (rhs_productions,1)
        rhs_prod_list = column (rhs_productions,0)
        rhs_display_str = ConvertPhoneSeqToString (rhs_prod_list, phone_name_converter)
        num_productions = sum (rhs_prod_counts)
        entropy = TinyStats.ComputeEntropyFromSymbolCounts (rhs_productions)
        perplexity = math.pow (2.0, entropy)
        production_info.append ((num_productions, perplexity, len(rhs_display_str)))

    try:
        max_rhs_string_length = max (column (production_info,2))
    except:
        max_rhs_string_length = 1
    formatting_string = string.replace ('%4i. %6i %6.3f %4s -> %-Xs', 'X', str(max_rhs_string_length+2))

    # Second pass: write one line per letter.
    for i, lhs in enumerate (lhs_keys):
        rhs_productions = lts_productions[lhs]
        rhs_prod_counts = column (rhs_productions,1)
        rhs_prod_list = column (rhs_productions,0)
        rhs_display_str = ConvertPhoneSeqToString (rhs_prod_list, phone_name_converter)
        num_productions, perplexity = production_info[i][:2]
        try:
            outfile.write (formatting_string %(i+1, num_productions, perplexity, lhs, rhs_display_str))
        except UnicodeEncodeError:
            lhs_utf = utf(lhs)[0]
            outfile.write (formatting_string %(i+1, num_productions, perplexity, lhs_utf, rhs_display_str))
        for j, prod_count in enumerate (rhs_prod_counts):
            #if j > 3: continue
            outfile.write ('%6i' %(prod_count))
        outfile.write('\n')
    outfile.write('\n')

    perplexity_list = column (production_info,1)
    if perplexity_list:
        min_perplexity = min (perplexity_list)
        max_perplexity = max (perplexity_list)
    else:
        min_perplexity = 0
        max_perplexity = 0

    outfile.write ('Num letter productions: %5i\n' %(len(sorted_production_list)))
    outfile.write ('Min letter perplexity: %6.3f\n' %(min_perplexity))
    outfile.write ('Max letter perplexity: %6.3f\n' %(max_perplexity))
    outfile.write ('Ave letter perplexity: %6.3f\n' %(WeightedAve(production_info)))
    outfile.write ('\n')
    return lts_productions
def GetLetterStats (self):
    return column (self.ngram_stats)
def WriteOutRules (output_filename, lts_rule_system, phone_name_converter = {}, include_sorted_rule_list = False):
    # --------------------------------------------------------------------------
    # Added this to protect from automatic conversion to unicode through xml-rpc.
    # This is just a short term fix before fully supporting UTF-8.
    def Enc (given_string, encoding = 'latin-1'):
        if type (given_string) == type ('ascii'):
            return given_string
        else:
            return given_string.encode (encoding)

    # case 1. open a file suitable for writing utf-8 strings
    # case 2. most likely this is sys.stdout
    if type (output_filename) == type('string'):
        outfile = codecs.open (output_filename, 'w', 'utf-8')
    else:
        outfile = output_filename

    rule_count = 0
    lhs_symbol_list = lts_rule_system.keys()
    lhs_symbol_list.sort (cmp = LatinCharacterSet.OrderLetters)
    lhs_perplexity_list = []

    # find the right width for the rhs part
    M = max_rhs_symbol_length = 0
    sorted_rule_list = []
    for lhs in lhs_symbol_list:
        lts_rule_chain = lts_rule_system[lhs]
        for rule_context, rhs_symbol_seq, application_count in lts_rule_chain:
            rhs_string = string.join (rhs_symbol_seq,'-')
            #M = max (M, len(Enc(rhs_string)))
            M = max (M, len(utf(rhs_string)))
            sorted_rule_list.append ((application_count, lhs, rhs_symbol_seq, rule_context))
    format_str = string.replace ('%6i. %2s -> %Xs / %s [%i] %s', 'X', str(M))

    outfile.write ('LTS Rule System:\n')
    outfile.write ('%6s %6s\n' %('Count','Perplexity'))
    for lhs in lhs_symbol_list:
        lts_rule_chain = lts_rule_system[lhs]
        entropy = TinyStats.ComputeEntropyFromSymbolCounts (column (lts_rule_chain,1,3))
        perplexity = math.pow (2.0, entropy)
        application_total = sum (column (lts_rule_chain,2))
        lhs_perplexity_list.append ((application_total, perplexity))
        outfile.write ('%6i %6.3f' %(len(lts_rule_chain), perplexity))
        for rule_context, rhs_symbol_seq, application_count in lts_rule_chain:
            rhs_symbol_seq = map ((lambda x: phone_name_converter.get(x,x)), rhs_symbol_seq)
            rule_count += 1
            lhs_symbol = rule_context[1]
            context_str = rule_context[0] + '_' + rule_context[2]
            rhs_string = string.join (rhs_symbol_seq,'-')
            num_chars = 1 + int (math.log (max(1.0,application_count),10.0))
            spacer = ' ' * max (0, (10 - num_chars - len(context_str)))
            if rhs_string == '':
                rhs_string = '_'
            lhs_utf = utf(lhs_symbol)[0]
            rhs_utf = utf(rhs_string)[0]
            ctx_utf = utf(context_str)[0]
            #outfile.write (format_str %(rule_count, Enc(lhs_symbol), Enc(rhs_string), Enc(context_str), application_count, spacer))
            #outfile.write (format_str %(rule_count, lhs_utf, rhs_utf, ctx_utf, application_count, spacer))
            outfile.write (format_str %(rule_count, lhs_symbol, rhs_string, context_str, application_count, spacer))
            outfile.write('\n')

    perplexity_list = column (lhs_perplexity_list,1)
    if perplexity_list:
        min_perplexity = min (perplexity_list)
        max_perplexity = max (perplexity_list)
    else:
        min_perplexity = 0
        max_perplexity = 0

    outfile.write ('\n')
    outfile.write ('Number of LTS rules: %i\n' %(rule_count))
    outfile.write ('Min lts rule perplexity: %6.3f\n' %(min_perplexity))
    outfile.write ('Max lts rule perplexity: %6.3f\n' %(max_perplexity))
    outfile.write ('Ave lts rule perplexity: %6.3f\n' %(WeightedAve(lhs_perplexity_list)))
    outfile.write ('\n')

    if include_sorted_rule_list:
        sorted_rule_list.sort (reverse=True)
        outfile.write ('Rules sorted by count:')
        for i, (rule_count, lhs, rhs, rule_context) in enumerate (sorted_rule_list):
            rhs_symbol_seq = map ((lambda x: phone_name_converter.get(x,x)), rhs)
            context_str = rule_context[0] + '_' + rule_context[2]
            rhs_string = string.join (rhs_symbol_seq,'-')
            outfile.write ('\n')
            outfile.write (format_str %(i+1, lhs, rhs_string, context_str, rule_count, ''))
        outfile.write ('\n\n')
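def _PerplexitySketch ():
    # Illustrative only (never called): perplexity = 2**entropy is the
    # "effective number" of alternative productions for a letter. TinyStats is
    # not shown in this file; this stand-in assumes that
    # ComputeEntropyFromSymbolCounts takes (symbol, count) pairs and returns
    # the Shannon entropy in bits, which matches how it is called above.
    def _entropy_from_symbol_counts (symbol_count_pairs):
        total = float (sum ([count for (symbol, count) in symbol_count_pairs]))
        answer = 0.0
        for symbol, count in symbol_count_pairs:
            p = count / total
            answer -= p * math.log (p, 2.0)
        return answer
    # A letter producing 'k' 75 times and 's' 25 times has entropy ~0.811
    # bits, i.e. perplexity 2**0.811 ~= 1.75 effective productions.
    perplexity = math.pow (2.0, _entropy_from_symbol_counts ([('k', 75), ('s', 25)]))
    assert abs (perplexity - 1.7548) < 0.001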
def FindCoverageWordList (sorted_rule_list):
    # Note: rule_word_id_sets, rule_count_table, rule_count_lhs_table,
    # given_word_pronun_list, sorted_word_id_list, selected_words_rule_count_tbl,
    # maximum_merit, FindBestWordInSet, and UpdateWordIdSets all come from the
    # enclosing scope.
    # ------------------------------------------------------------------------------------------
    def FindOneWord (given_rule_list, rules_not_covered, test_size_threshold = 1):
        prev_word_set = set()
        cand_word_set = set()
        local_rule_list = given_rule_list[:]
        #local_rule_list.reverse()
        for i, (count, rule) in enumerate (local_rule_list):
            if rule not in rules_not_covered: continue
            if not cand_word_set:
                cand_word_set = rule_word_id_sets [rule]
            else:
                cand_word_set = cand_word_set.intersection (rule_word_id_sets[rule])
            if cand_word_set:
                prev_word_set = cand_word_set     # save the last non-empty set
                if len(cand_word_set) <= test_size_threshold:
                    break
            else:
                cand_word_set = prev_word_set     # restore the candidate set to be non-empty
            # alternate halt strategy that is not as effective as the above:
            # if not cand_word_set: break
            # prev_word_set = cand_word_set
        best_id, best_score, best_rules, rule_counts = FindBestWordInSet (prev_word_set, rules_not_covered, verbose=False)
        best_word = string.join (given_word_pronun_list[best_id][0],'')
        cand_word_set = rule_word_id_sets[rule]
        n = len (sorted_word_id_list) + len (best_word_id_list) + 1
        print 'Word %6i %5i %8.2f %4i %4i %s' %(n, len(best_word), best_score, len(best_rules), len(rules_not_covered), best_word), best_rules
        return best_id, rule_counts
        pass

    #---
    Set_Size_Threshold = 100

    # old
    # non_covered_rules = set (column (filter ((lambda x: len(x[1]) > 0), rule_word_id_sets.items())))
    non_covered_rules = set()
    for count, rule in sorted_rule_list:
        rule_lhs, rule_rhs_index = rule
        if rule_rhs_index == 1 and rule_word_id_sets[rule]:
            non_covered_rules.add (rule)

    best_word_id_set = set()
    best_word_id_list = []

    # pass 1
    loop_count = 0
    num_reinsertions = 0
    while non_covered_rules:
        loop_count += 1
        if loop_count >= 510:
            return best_word_id_list    # jmk!!!
        print '%6i. Searching for %i rules among %i ...' %(loop_count, len(non_covered_rules), len(sorted_rule_list))

        # resort the rules
        sorted_rule_list = [(count, rule) for (rule, count) in rule_count_table.items()]
        sorted_rule_list.sort (reverse=True)

        j = 0
        for count, rule in sorted_rule_list:
            if rule in non_covered_rules:
                j += 1
                print '%6i looking %6i %s' %(j, count, rule)
        print

        best_word_id, rules_covered = FindOneWord (sorted_rule_list, non_covered_rules, Set_Size_Threshold)
        best_word_id_set.add (best_word_id)
        best_word_id_list.append (best_word_id)
        UpdateWordIdSets (best_word_id, rules_covered.keys())
        for rule, count in rules_covered.items():
            selected_words_rule_count_tbl [rule] = selected_words_rule_count_tbl.get(rule,0) + count

        word_charseq = given_word_pronun_list [best_word_id][0]
        word_charset = set (word_charseq)
        word = string.join (word_charseq,'')
        for lhs in word_charset:
            rule_chain = rule_count_lhs_table.get(lhs,[])
            for i in range (1,len(rule_chain)):
                this_rule_total, this_rule = rule_chain[i]
                prev_rule_total, prev_rule = rule_chain[i-1]
                this_rule_count = selected_words_rule_count_tbl.get (this_rule,0)
                prev_rule_count = selected_words_rule_count_tbl.get (prev_rule,0)
                print 'here %2s %4i %6i %8s %12s %-2i %6i' \
                      %(lhs, i, prev_rule_count, prev_rule, this_rule, this_rule_count, this_rule_total),
                if this_rule_count == 0:
                    if this_rule not in non_covered_rules:
                        non_covered_rules.add (this_rule)
                        print ' adding this', this_rule,
                        if prev_rule_count <= 1:
                            non_covered_rules.add (prev_rule)
                            rule_count_table [prev_rule] = this_rule_total + 1
                            print ' adding prev', prev_rule,
                    print
                    break
                elif prev_rule_count <= this_rule_count and prev_rule not in non_covered_rules:
                    non_covered_rules.add (prev_rule)
                    rule_count_table [prev_rule] = this_rule_total + 1
                    print ' adding prev', prev_rule
                    break
                else:
                    print
            print

        """
        for word_rule, word_count in rules_covered.items():
            rule_lhs, rule_rank = word_rule
            rule_chain = rule_count_lhs_table [rule_lhs]
            N = len (rule_chain)
            next_rule = ''
            for i, (occurance_count, rule) in enumerate (rule_chain):
                if rule == word_rule:
                    if i+1 < N:
                        next_rule = rule_chain[i+1][1]
                    break
            rule_already_wanted = next_rule in non_covered_rules
            if next_rule and not rule_already_wanted:
                non_covered_rules.add (word_rule)
                non_covered_rules.add (next_rule)
                this_word_count = selected_words_rule_count_tbl.get (word_rule,0)
                next_word_count = selected_words_rule_count_tbl.get (next_rule,0)
                print 'here %4i %s %6s %s %6i %6i %s' \
                      %(len(non_covered_rules), rule, rule_already_wanted, next_rule, this_word_count, next_word_count, word)
        """
        """
        for rule in rules_covered.keys():
            selected_rule_count = selected_words_rule_count_tbl [rule]
            #print '%6i %6i %2i %s' %(rule_count_table[rule], selected_rule_count, count, rule)
            # this is inefficient!
            for compare_count, compare_rule in sorted_rule_list:
                if rule[0] == compare_rule[0]:
                    if rule_count_table [rule] < rule_count_table [compare_rule] \
                       and selected_words_rule_count_tbl [rule] >= selected_words_rule_count_tbl.get(compare_rule,0):
                        num_reinsertions += 1
                        non_covered_rules.add (compare_rule)
                        #print ' re-adding rule %6i %6i %s' %(count, rule_count_table [compare_rule], compare_rule)
        """
        pass

    print 'Num reinsertions', num_reinsertions
    return best_word_id_list

    # Note: pass 2 may not work with this revised algorithm, so it is skipped
    # by the return above; the code is kept below for reference.
    # pass 2
    non_covered_rules = set (column (filter ((lambda x: len(x[1]) > 0), rule_word_id_sets.items())))
    total_letter_count = 0
    total_rules_covered = set()
    accumulated_score = 1.0
    new_word_id_list = []
    while best_word_id_set:
        word_id, word_score, word_rules, rule_counts = FindBestWordInSet (best_word_id_set, non_covered_rules)
        best_word_id_set.discard (word_id)
        new_word_id_list.append (word_id)
        UpdateWordIdSets (word_id, word_rules)
        list_size = len (new_word_id_list)
        entry = given_word_pronun_list [word_id]
        word = string.join (entry[0],'')
        total_letter_count += len(word)
        total_rules_covered |= word_rules
        current_merit = sum (map ((lambda x: rule_count_table[x]), total_rules_covered))
        current_score = 1.0 - float(current_merit) / maximum_merit
        accumulated_score += current_score * len(word)
        print '%4i %5i %8.2f %4i %4i %8i %8.3f %8.3f %s' \
              %(list_size, total_letter_count, word_score, len(word_rules), len(total_rules_covered), current_merit, current_score, accumulated_score, word)
        if not non_covered_rules: break
    print
    return new_word_id_list
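def _GreedyCoverSketch ():
    # Illustrative only (never called): FindCoverageWordList is, at heart, a
    # greedy cover -- repeatedly pick a word whose rules hit as many
    # still-uncovered rules as possible. A bare-bones version of that idea on
    # made-up data (the real code additionally weights rules by count and
    # re-inserts rules from each letter's rule chain):
    def greedy_cover (word_rule_sets):
        uncovered = set().union (*word_rule_sets.values())
        chosen = []
        while uncovered:
            word = max (word_rule_sets, key = (lambda w: len (word_rule_sets[w] & uncovered)))
            chosen.append (word)
            uncovered -= word_rule_sets[word]
        return chosen
    cover = greedy_cover ({'cat': set([1, 2]), 'car': set([1, 3]), 'tar': set([2, 3, 4])})
    assert cover[0] == 'tar' and len (cover) == 2   # 'tar' covers {2,3,4}; one more word covers {1}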