def analysewords(results, words, chars, hsk_word_count, hsk_char_count):
    """Append an HTML overview of the input's entries and characters.

    Adds a heading, a bullet list of counts, and four text boxes to
    ``results`` (mutated in place; nothing is returned).

    Args:
        results: List of HTML fragments being accumulated.
        words: Set of unique input entries (it is intersected with the
            module-level SUBTLEX and CC-CEDICT word sets).
        chars: Collection of unique input characters.
        hsk_word_count: Mapping whose values are summed to give the total
            number of entries.
        hsk_char_count: Mapping whose values are summed to give the total
            number of characters.
    """
    results.append("<h4>Analysis of Words/Characters in Input</h4>")

    unique_entries = len(words)
    single_char_entries = sum(1 for entry in words if len(entry) == 1)
    in_subtlex = subtlex_word_set & words
    in_cedict = cedict_word_set & words

    # Order here must match the order of the placeholders below.
    summary_counts = (
        single_char_entries,
        unique_entries - single_char_entries,
        unique_entries,
        sum(hsk_word_count.values()),
        len(chars),
        sum(hsk_char_count.values()),
        len(in_subtlex),
        len(in_cedict),
    )
    results.append("""Input contained:<ul>
<li>{} unique single-character entries</li>
<li>{} unique multi-character entries</li>
<li>{} unique entries</li>
<li>{} total entries</li>
<li>{} unique characters</li>
<li>{} total characters</li>
<li>{} unique words as recognised by SUBTLEX-CH</li>
<li>{} unique words as recognised by CC-CEDICT</li>
</ul>""".format(*summary_counts))

    # Build every box's text first, then append, so a failure while ordering
    # never leaves a partial set of boxes in ``results``.
    boxes = [
        ("Unique Entries", "wordsknown",
         "\n".join(frequency_order_word(words))),
        ("Unique Characters", "charsknown",
         "\n".join(frequency_order_char(chars))),
        ("SUBTLEX Words", "subtlexknown",
         "\n".join(frequency_order_word(in_subtlex))),
        ("CC-CEDICT Words", "cedictknown",
         "\n".join(frequency_order_word(in_cedict))),
    ]
    for box_title, box_id, box_text in boxes:
        results.append(blockboxtemplate().format(box_title, box_id, box_text))
def suggesthskchars(results, chars):
    """Append boxes of characters that are absent from the input.

    One box is added per HSK level 1-6 holding that level's characters that
    are not in ``chars``, followed by a "Non-HSK" box holding up to 1000
    entries taken, in order, from ``char_frequency_ordered`` that are neither
    in ``chars`` nor in ``hsk_chars[16]``.

    Args:
        results: List of HTML fragments; mutated in place.
        chars: Set of characters present in the input.
    """
    results.append("""<h4>Suggested HSK Characters not in Input</h4>""")

    for level in range(1, 7):
        missing_text = "\n".join(frequency_order_char(hsk_chars[level] - chars))
        results.append(blockboxtemplate().format(
            "HSK " + str(level), "hskcharstolearn" + str(level), missing_text))

    # presumably hsk_chars[16] is the combined HSK 1-6 set -- confirm where
    # hsk_chars is built.
    suggestions = []
    for _freq, candidate in char_frequency_ordered:
        if candidate in chars or candidate in hsk_chars[16]:
            continue
        suggestions.append(candidate)
        if len(suggestions) >= 1000:
            break

    suggestions_text = "\n".join(frequency_order_char(suggestions))
    # NOTE(review): ``level`` is still 6 here (left over from the loop above),
    # so the identifier is "nonhskcharstolearn6". Kept as-is in case anything
    # depends on that exact identifier; confirm whether the suffix is wanted.
    results.append(blockboxtemplate().format(
        "Non-HSK", "nonhskcharstolearn" + str(level), suggestions_text))
def suggesthskwords(results, words):
    """Append boxes of words that are absent from the input.

    One box is added per HSK level 1-6 holding that level's words that are
    not in ``words``, followed by a "Non-HSK" box holding up to 1000 entries
    taken, in order, from ``word_frequency_ordered`` that are neither in
    ``words`` nor in ``hsk_words[16]``.

    Args:
        results: List of HTML fragments; mutated in place.
        words: Set of words present in the input.
    """
    results.append("""<h4>Suggested HSK Words not in Input</h4>""")

    for level in range(1, 7):
        missing_text = "\n".join(frequency_order_word(hsk_words[level] - words))
        results.append(blockboxtemplate().format(
            "HSK " + str(level), "hskwordstolearn" + str(level), missing_text))

    # presumably hsk_words[16] is the combined HSK 1-6 set -- confirm where
    # hsk_words is built.
    suggestions = []
    for _freq, candidate in word_frequency_ordered:
        if candidate in words or candidate in hsk_words[16]:
            continue
        suggestions.append(candidate)
        if len(suggestions) >= 1000:
            break

    suggestions_text = "\n".join(frequency_order_word(suggestions))
    # NOTE(review): ``level`` is still 6 here (left over from the loop above),
    # so the identifier is "nonhskwordstolearn6". Kept as-is in case anything
    # depends on that exact identifier; confirm whether the suffix is wanted.
    results.append(blockboxtemplate().format(
        "Non-HSK", "nonhskwordstolearn" + str(level), suggestions_text))
def setresultbox(title, idname, itemset, joinchar, itemname):
    """Return a filled-in block box whose title shows the item count.

    The title is suffixed with a small italic note: "(empty)" when
    ``itemset`` has no members, otherwise "(<n> <itemname>)" with an "s"
    appended to ``itemname`` when there is more than one item.

    Args:
        title: Heading text for the box.
        idname: Identifier passed through to the box template.
        itemset: Sized iterable of strings to show in the box.
        joinchar: Separator used to join the items.
        itemname: Singular noun describing one item.

    Returns:
        The string produced by formatting ``blockboxtemplate()`` with the
        decorated title, ``idname`` and the joined items.
    """
    count = len(itemset)
    if count:
        plural = "" if count <= 1 else "s"
        size_note = """<span style="font-weight: lighter; font-size: 80%; font-style: italic;"> ({} {}{})</span>""".format(
            count, itemname, plural)
    else:
        size_note = """<span style="font-weight: lighter; font-size: 80%; font-style: italic;"> (empty)</span>"""

    heading = title + size_note
    return blockboxtemplate().format(heading, idname, joinchar.join(itemset))
def analysehskchars(results, chars, hsk_char_count):
    """Append an HTML breakdown of the input's characters by HSK level.

    Emits, in order:
      1. per-level coverage of each HSK level's character set;
      2. the share of the input's *unique* characters in each level
         (only when the input has at least one character);
      3. the share of the input's *total* character occurrences in each
         level, or a one-line note when the total equals the unique count;
      4. one box per level with the characters found, plus a "Non-HSK" box
         with the characters of ``chars`` that are not in ``hsk_chars[16]``.

    Args:
        results: List of HTML fragments; mutated in place.
        chars: Set of unique characters in the input.
        hsk_char_count: Mapping indexed by level 1-6 giving occurrence
            counts; the sum of all its values is used as the total number
            of characters.
    """
    levels = range(1, 7)
    emit = results.append

    emit("<h4>Analysis of HSK Characters in Input</h4>")
    emit("Input contained:<ul>")

    # --- 1. Coverage of each HSK level -----------------------------------
    found_by_level = {}
    found_count = {}
    # Index 0 seeds the running sums so level 1 can look back one step.
    running_found = {0: 0}
    running_size = {0: 0}
    for level in levels:
        found_by_level[level] = chars & hsk_chars[level]
        found_count[level] = len(found_by_level[level])
        level_size = len(hsk_chars[level])
        share = 100 * float(found_count[level]) / level_size
        running_found[level] = running_found[level - 1] + found_count[level]
        running_size[level] = running_size[level - 1] + level_size
        emit("""<li>{} ({:.2f}%) of the {} HSK {} characters""".format(
            found_count[level], share, level_size, level))
        # A cumulative figure for level 1 alone would just repeat the line.
        if level > 1:
            running_share = (
                100 * float(running_found[level]) / running_size[level])
            emit(""" <i>(Cumulative: {} ({:.2f}%) of the {} HSK 1-{} characters)</i>""".format(
                running_found[level], running_share,
                running_size[level], level))
        emit("</li>")
    emit("</ul>")

    # --- 2. Share of unique input characters -----------------------------
    unique_total = len(chars)
    if unique_total > 0:
        outside_hsk = unique_total - running_found[6]
        emit("Of the {} <b>unique</b> characters in the input:<ul>".format(
            unique_total))
        for level in levels:
            share = 100 * float(found_count[level]) / unique_total
            emit("""<li>{} ({:.2f}%) were HSK {} characters""".format(
                found_count[level], share, level))
            if level > 1:
                running_share = (
                    100 * float(running_found[level]) / unique_total)
                emit("""<i>(Cumulative: {} ({:.2f}%) were HSK 1-{} characters)</i>""".format(
                    running_found[level], running_share, level))
            emit("</li>")
        outside_share = 100 * float(outside_hsk) / unique_total
        emit("""<li>{} ({:.2f}%) were non-HSK characters</li>""".format(
            outside_hsk, outside_share))
        emit("</ul>")

    # --- 3. Share of total character occurrences -------------------------
    # NOTE(review): this section is not nested under the ``unique_total > 0``
    # guard, so an empty input (0 == 0) takes the "appeared only once"
    # branch -- confirm that is intended.
    occurrences = sum(hsk_char_count.values())
    if occurrences == unique_total:
        emit("<p><i>Each character appeared only once in the input.</i></p>")
    else:
        running_occurrences = 0
        emit("Of the {} <b>total</b> characters that were input:<ul>".format(
            occurrences))
        for level in levels:
            share = 100 * float(hsk_char_count[level]) / occurrences
            running_occurrences += hsk_char_count[level]
            emit("""<li>{} ({:.2f}%) were HSK {} characters""".format(
                hsk_char_count[level], share, level))
            if level > 1:
                running_share = 100 * float(running_occurrences) / occurrences
                emit("""<i>(Cumulative: {} ({:.2f}%) were HSK 1-{} characters)</i>""".format(
                    running_occurrences, running_share, level))
            emit("</li>")
        # Whatever is not attributed to levels 1-6 is reported as non-HSK.
        outside_occurrences = occurrences - running_occurrences
        outside_share = 100 * float(outside_occurrences) / occurrences
        emit("""<li>{} ({:.2f}%) were non-HSK characters</li>""".format(
            outside_occurrences, outside_share))
        emit("</ul>")

    # --- 4. Boxes listing the characters themselves ----------------------
    for level in levels:
        level_text = "\n".join(frequency_order_char(found_by_level[level]))
        emit(blockboxtemplate().format(
            "HSK " + str(level), "hskcharsknown" + str(level), level_text))

    # presumably hsk_chars[16] is the combined HSK 1-6 set -- confirm where
    # hsk_chars is built.
    outside_text = "\n".join(frequency_order_char(chars - hsk_chars[16]))
    emit(blockboxtemplate().format("Non-HSK", "nonhskcharsknown", outside_text))