Exemplo n.º 1
0
def analysewords(results, words, chars, hsk_word_count, hsk_char_count):
    """Append a summary of the input's words and characters to ``results``.

    The summary is a heading, a bullet list of counts, and four text boxes
    (all entries, all characters, entries found in the SUBTLEX word set,
    entries found in the CC-CEDICT word set).

    Args:
        results: List of HTML fragments; mutated in place.
        words: Unique entries from the input; intersected with the
            module-level word sets, so it must support ``&``.
        chars: Unique characters from the input.
        hsk_word_count: Mapping whose values sum to the total entry count.
        hsk_char_count: Mapping whose values sum to the total character count.
    """
    results.append("<h4>Analysis of Words/Characters in Input</h4>")

    n_single = sum(1 for entry in words if len(entry) == 1)
    n_entries = len(words)
    n_chars = len(chars)
    n_total_entries = sum(hsk_word_count.values())
    n_total_chars = sum(hsk_char_count.values())
    in_subtlex = subtlex_word_set & words
    in_cedict = cedict_word_set & words

    summary = (
        "Input contained:<ul>\n"
        "<li>{} unique single-character entries</li>\n"
        "<li>{} unique multi-character entries</li>\n"
        "<li>{} unique entries</li>\n"
        "<li>{} total entries</li>\n"
        "<li>{} unique characters</li>\n"
        "<li>{} total characters</li>\n"
        "<li>{} unique words as recognised by SUBTLEX-CH</li>\n"
        "<li>{} unique words as recognised by CC-CEDICT</li>\n"
        "</ul>"
    )
    results.append(summary.format(
        n_single,
        n_entries - n_single,  # everything that is not a single character
        n_entries,
        n_total_entries,
        n_chars,
        n_total_chars,
        len(in_subtlex),
        len(in_cedict),
    ))

    # All listings are built before any box is appended.
    boxes = [
        ("Unique Entries", "wordsknown",
         "\n".join(frequency_order_word(words))),
        ("Unique Characters", "charsknown",
         "\n".join(frequency_order_char(chars))),
        ("SUBTLEX Words", "subtlexknown",
         "\n".join(frequency_order_word(in_subtlex))),
        ("CC-CEDICT Words", "cedictknown",
         "\n".join(frequency_order_word(in_cedict))),
    ]
    for heading, box_id, listing in boxes:
        results.append(blockboxtemplate().format(heading, box_id, listing))
Exemplo n.º 2
0
def suggesthskchars(results, chars):
    """Append boxes of characters that are absent from the input.

    One box is appended per HSK level 1-6, listing that level's characters
    not present in ``chars``. A final "Non-HSK" box lists up to 1000
    entries from ``char_frequency_ordered`` that are neither in ``chars``
    nor in ``hsk_chars[16]``.

    Args:
        results: List of HTML fragments; mutated in place.
        chars: Characters already present in the input; subtracted from
            each ``hsk_chars`` level, so it must support set difference.
    """
    results.append("""<h4>Suggested HSK Characters not in Input</h4>""")

    for level in range(1, 7):
        listing = "\n".join(frequency_order_char(hsk_chars[level] - chars))
        box = blockboxtemplate().format(
            "HSK {}".format(level),
            "hskcharstolearn{}".format(level),
            listing,
        )
        results.append(box)

    suggestions = []
    for _freq, candidate in char_frequency_ordered:
        # hsk_chars[16] is presumably the combined HSK 1-6 set -- confirm
        # where hsk_chars is built.
        if candidate in chars or candidate in hsk_chars[16]:
            continue
        suggestions.append(candidate)
        if len(suggestions) >= 1000:
            break

    listing = "\n".join(frequency_order_char(suggestions))
    # NOTE: the id suffix is the loop variable's final value (6), so the
    # box id is "nonhskcharstolearn6".
    results.append(blockboxtemplate().format(
        "Non-HSK", "nonhskcharstolearn" + str(level), listing))
Exemplo n.º 3
0
def suggesthskwords(results, words):
    """Append boxes of words that are absent from the input.

    One box is appended per HSK level 1-6, listing that level's words not
    present in ``words``. A final "Non-HSK" box lists up to 1000 entries
    from ``word_frequency_ordered`` that are neither in ``words`` nor in
    ``hsk_words[16]``.

    Args:
        results: List of HTML fragments; mutated in place.
        words: Words already present in the input; subtracted from each
            ``hsk_words`` level, so it must support set difference.
    """
    results.append("""<h4>Suggested HSK Words not in Input</h4>""")

    for level in range(1, 7):
        listing = "\n".join(frequency_order_word(hsk_words[level] - words))
        box = blockboxtemplate().format(
            "HSK {}".format(level),
            "hskwordstolearn{}".format(level),
            listing,
        )
        results.append(box)

    suggestions = []
    for _freq, candidate in word_frequency_ordered:
        # hsk_words[16] is presumably the combined HSK 1-6 set -- confirm
        # where hsk_words is built.
        if candidate in words or candidate in hsk_words[16]:
            continue
        suggestions.append(candidate)
        if len(suggestions) >= 1000:
            break

    listing = "\n".join(frequency_order_word(suggestions))
    # NOTE: the id suffix is the loop variable's final value (6), so the
    # box id is "nonhskwordstolearn6".
    results.append(blockboxtemplate().format(
        "Non-HSK", "nonhskwordstolearn" + str(level), listing))
Exemplo n.º 4
0
def setresultbox(title, idname, itemset, joinchar, itemname):
    """Return a filled-in box template whose title shows the item count.

    The title is followed by a small italic note: "(empty)" when
    ``itemset`` has no items, otherwise "(N itemname)" with an "s"
    appended to ``itemname`` when N is greater than one.

    Args:
        title: Heading text for the box.
        idname: Identifier passed to the template as its second field.
        itemset: Sized iterable of strings to list in the box.
        joinchar: Separator used to join the items.
        itemname: Singular noun describing one item.

    Returns:
        The string produced by formatting ``blockboxtemplate()``.
    """
    count = len(itemset)
    if count == 0:
        note = " (empty)"
    else:
        plural = "s" if count > 1 else ""
        note = " ({} {}{})".format(count, itemname, plural)

    span = (
        '<span style="font-weight: lighter; font-size: 80%; '
        'font-style: italic;">' + note + "</span>"
    )
    heading = title + span
    body = joinchar.join(itemset)
    return blockboxtemplate().format(heading, idname, body)
Exemplo n.º 5
0
def analysehskchars(results, chars, hsk_char_count):
    """Append an HTML report on the HSK characters found in the input.

    Three summaries are appended to ``results`` as HTML fragments, followed
    by one text box per HSK level and one box for non-HSK characters:

    * how many characters of each HSK level (1-6) appear in the input,
    * how the unique input characters split across the levels (skipped
      when ``chars`` is empty),
    * how the total character occurrences split across the levels
      (replaced by a short note when the total equals the number of
      unique characters).

    Args:
        results: List of HTML fragments; mutated in place.
        chars: Unique characters of the input. Must support ``&`` and
            ``-`` against the module-level ``hsk_chars`` entries.
        hsk_char_count: Mapping whose values sum to the total number of
            characters in the input; keys 1-6 hold the per-level counts.
    """
    results.append("<h4>Analysis of HSK Characters in Input</h4>")
    results.append("Input contained:<ul>")

    knownintersect = {}
    numknown = {}
    numhsk = {}
    # Index 0 seeds the running totals so level 1 can add to "level 0".
    cumulativeknown = {0: 0}
    cumulativetotal = {0: 0}

    # Coverage of each HSK level by the input.
    for i in range(1, 7):
        knownintersect[i] = chars & hsk_chars[i]
        numknown[i] = len(knownintersect[i])
        numhsk[i] = len(hsk_chars[i])
        percentknown = 100 * float(numknown[i]) / numhsk[i]
        cumulativeknown[i] = cumulativeknown[i - 1] + numknown[i]
        cumulativetotal[i] = cumulativetotal[i - 1] + numhsk[i]
        results.append(
            """<li>{} ({:.2f}%) of the {} HSK {} characters""".format(
                numknown[i], percentknown, numhsk[i], i))
        # A cumulative figure only adds information from level 2 onwards.
        if i > 1:
            cumpercentknown = 100 * float(
                cumulativeknown[i]) / cumulativetotal[i]
            results.append(
                """ <i>(Cumulative: {} ({:.2f}%) of the {} HSK 1-{} characters)</i>"""
                .format(cumulativeknown[i], cumpercentknown,
                        cumulativetotal[i], i))
        results.append("</li>")
    results.append("</ul>")

    # Breakdown of the unique input characters by level.
    totalunique = len(chars)
    if totalunique > 0:
        numknown_nonhsk = totalunique - cumulativeknown[6]
        results.append(
            "Of the {} <b>unique</b> characters in the input:<ul>".format(
                totalunique))
        for i in range(1, 7):
            percentknown = 100 * float(numknown[i]) / totalunique
            results.append(
                """<li>{} ({:.2f}%) were HSK {} characters""".format(
                    numknown[i], percentknown, i))
            if i > 1:
                cumpercentknown = 100 * float(cumulativeknown[i]) / totalunique
                # Leading space keeps the note from running into the
                # preceding word in the rendered HTML.
                results.append(
                    """ <i>(Cumulative: {} ({:.2f}%) were HSK 1-{} characters)</i>"""
                    .format(cumulativeknown[i], cumpercentknown, i))
            results.append("</li>")
        numknown_nonhsk_percent = 100 * float(numknown_nonhsk) / totalunique
        results.append(
            """<li>{} ({:.2f}%) were non-HSK characters</li>""".format(
                numknown_nonhsk, numknown_nonhsk_percent))
        results.append("</ul>")

    # Breakdown of all character occurrences by level.
    totalchars = sum(hsk_char_count.values())
    if totalchars == totalunique:
        results.append(
            "<p><i>Each character appeared only once in the input.</i></p>")
    else:
        cumknown = 0
        results.append(
            "Of the {} <b>total</b> characters that were input:<ul>".format(
                totalchars))
        for i in range(1, 7):
            percentknown = 100 * float(hsk_char_count[i]) / totalchars
            cumknown += hsk_char_count[i]
            results.append(
                """<li>{} ({:.2f}%) were HSK {} characters""".format(
                    hsk_char_count[i], percentknown, i))
            if i > 1:
                cumpercentknown = 100 * float(cumknown) / totalchars
                results.append(
                    """ <i>(Cumulative: {} ({:.2f}%) were HSK 1-{} characters)</i>"""
                    .format(cumknown, cumpercentknown, i))
            results.append("</li>")
        num_nonhsk = totalchars - cumknown
        numknown_nonhsk_percent = 100 * float(num_nonhsk) / totalchars
        results.append(
            """<li>{} ({:.2f}%) were non-HSK characters</li>""".format(
                num_nonhsk, numknown_nonhsk_percent))
        results.append("</ul>")

    # One box per level listing the input characters found at that level.
    for i in range(1, 7):
        charsknown = "\n".join(frequency_order_char(knownintersect[i]))
        results.append(blockboxtemplate().format("HSK " + str(i),
                                                 "hskcharsknown" + str(i),
                                                 charsknown))
    # hsk_chars[16] is presumably the combined HSK 1-6 set -- confirm where
    # hsk_chars is built.
    nonhskchars = "\n".join(frequency_order_char(chars - hsk_chars[16]))
    results.append(blockboxtemplate().format("Non-HSK", "nonhskcharsknown",
                                             nonhskchars))