Example #1
    def prob(self, grapheme, reading, alt_reading):
        """
        Returns the probability of P(r|k), using the formula:
        P(r|k) ~ (alpha)P_raw(r|k) + (1-alpha)P(r|r*)P(r*|k).
        """
        if scripts.to_hiragana(grapheme) == scripts.to_hiragana(alt_reading):
            # Special case: where the segment is phonetic.
            return 1.0

        # We only handle entire kanji segments.
        assert scripts.script_types(grapheme) == set([scripts.Script.Kanji])

        alpha = settings.ALTERNATION_ALPHA
        assert 0 <= alpha <= 1
        try:
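            # P_raw(r|k): direct frequency of alt_reading for this grapheme.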
            rawProb = self.raw_freq_dist[grapheme].freq(alt_reading)
        except KeyError:
            rawProb = 0.0

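        # P(r*|k): probability of the canonical reading given the kanji.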
        normalizedProb = self.normalized_freq_dist[grapheme].freq(reading)
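        # P(r|r*): probability that the canonical reading alternates to r.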
        alternationProb = self.alternation_dist[reading].freq(alt_reading)

        result = alpha*rawProb + (1-alpha)*normalizedProb*alternationProb

        return result
Example #2
def _get_kanji_readings(alignments):
    """
    Develop a set of readings for each kanji which a learner must know as part
    of this syllabus. This set may contain invalid readings, and will later
    be pruned to only valid readings.
    """
    kanji_script = set([scripts.Script.Kanji])  # script_types() returns a set
    readings = {}
    for alignment in alignments:
        alignment_len = len(alignment)
        for i, (g_seg, p_seg) in enumerate(zip(alignment.g_segs,
                    alignment.p_segs)):
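            # Only consider single-character, all-kanji grapheme segments.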
            if len(g_seg) > 1 or scripts.script_types(g_seg) != kanji_script:
                continue
            reading_set = readings.setdefault(g_seg, set())
            reading_set.add(p_seg)

            has_left_context = i > 0
            has_right_context = i < alignment_len - 1
            extra_variants = alternations.canonicalSegmentForms(p_seg,
                    leftContext=has_left_context,
                    rightContext=has_right_context)
            reading_set.update(extra_variants)

    return readings
Example #3
def _format_alignment(alignment):
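    """Format an alignment as '|'-separated reading segments, splitting
    non-kanji segments into their individual characters."""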
    result = []
    for g_seg, p_seg in zip(alignment.g_segs, alignment.p_segs):
        if scripts.script_types(g_seg) == set([scripts.Script.Kanji]):
            result.append(p_seg)
        else:
            result.extend(p_seg)
    return '|'.join(result)
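
The snippets above all branch on scripts.script_types, which returns a set
of Script values rather than a single value. A minimal sketch of the
expected behaviour (assuming the cjktools package is installed; outputs are
illustrative):

from cjktools import scripts

# script_types() returns the set of scripts present in the string,
# so comparisons must be made against a set, never a single Script member.
assert scripts.script_types(u'ありがとう') == set([scripts.Script.Hiragana])
assert scripts.script_types(u'日本語') == set([scripts.Script.Kanji])
# Mixed input yields more than one script type.
assert scripts.script_types(u'食べる') == set([scripts.Script.Kanji,
                                              scripts.Script.Hiragana])
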
Example #4
    def addItem(self):
        text = unicode(self.input.text()).strip()
        # Ignore empty input and input containing any ASCII characters.
        if text != '' and scripts.Script.Ascii not in scripts.script_types(self.input.text()):
            if self.input.text() not in self.user_list:
                self.user_list.append(self.input.text())
                self.appendToList(self.input.text())
        if self.enter.isChecked():
            self.input.clear()
Example #5
def get_accuracy_by_pivot_type():
    cursor = connection.cursor()
    cursor.execute("""
        SELECT
            question.pivot,
            SUM(chosen_option.is_correct) as n_correct,
            COUNT(*) as n_responses
        FROM (
            SELECT mco.question_id, mco.is_correct
            FROM drill_multiplechoiceresponse AS mcr
            INNER JOIN drill_multiplechoiceoption AS mco
            ON mcr.option_id = mco.id
        ) as chosen_option
        INNER JOIN drill_question AS question
        ON chosen_option.question_id = question.id
        WHERE question.pivot_type = 'w'
        GROUP BY question.pivot
    """)
    raw_data = cursor.fetchall()
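    # Bucket answer accuracy by the script type of each question's pivot word.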
    counts = {
        'Hiragana': FreqDist(),
        'Katakana': FreqDist(),
        'Kanji': FreqDist()
    }
    complex_scripts = set([scripts.Script.Kanji, scripts.Script.Unknown])
    only_katakana = set([scripts.Script.Katakana])
    for word, n_correct, n_responses in raw_data:
        scripts_found = scripts.script_types(word)
        if scripts_found.intersection(complex_scripts):
            dist = counts['Kanji']
        elif scripts_found.intersection(only_katakana):
            dist = counts['Katakana']
        else:
            dist = counts['Hiragana']

        dist.inc(True, int(n_correct))
        dist.inc(False, int(n_responses - n_correct))

    keys = ('Hiragana', 'Katakana', 'Kanji')

    data = [(key, counts[key].freq(True)) for key in keys]

    average = FreqDist()
    for key in keys:
        average.inc(True, counts[key][True])
        average.inc(False, counts[key][False])

    data.append(('Average', average.freq(True)))

    return data
Example #6
    def lookup(self, query):
        found = None

        if self.config.ignore_kana():
            # Skip queries written entirely in a single kana script.
            if len(scripts.script_types(query)) == 1:
                if scripts.script_type(query) in (scripts.Script.Hiragana,
                                                  scripts.Script.Katakana):
                    return found
        try:
            found = self.edict[query]

            if self.config.ignore_duplicates():
                if found.word in self.stats: found = None
                else: self.stats.append(found.word)
        except KeyError:
            if query not in self.missed: self.missed.append(query)
        finally:
            return found
Example #7
    def update(self, response):
        "Update our error model from a user's response."
        error_dist = models.ErrorDist.objects.get(user=response.user,
                                                  tag=self.dist_name)
        question = response.question
        base_segs = question.annotation.split(u'|')
        response_segs = response.option.annotation.split(u'|')
        distractor_sets = map(
            set,
            zip(*[
                o['annotation'].split('|') for o in
                question.multiplechoicequestion.options.values('annotation')
                if o['annotation'] != response.option.annotation
            ]))
        assert len(base_segs) == len(response_segs) == len(distractor_sets)

        for base_seg, response_seg, distractor_segs in \
                    izip(base_segs, response_segs, distractor_sets):
            if scripts.script_types(base_seg) != set([scripts.Script.Kanji]):
                continue
            sub_dist = models.ProbDist.from_query_set(
                error_dist.density.filter(condition=base_seg))
            e = settings.UPDATE_EPSILON

            try:
                m = max(imap(sub_dist.__getitem__, distractor_segs)) + e
                existing_score = sub_dist[response_seg]
            except KeyError:
                raise UpdateError(
                    u'for user %s, dist %s, response %d: no entry for %s|%s' %
                    (
                        response.user.username,
                        self.dist_name,
                        response.id,
                        response_seg,
                        base_seg,
                    ))

            if m > existing_score:
                sub_dist[response_seg] = m
                sub_dist.normalise()
                sub_dist.save_to(error_dist.density, condition=base_seg)
        return
Example #8
def _jpn(token):
    """Convert jpn token to phonemes."""
    import sys

    from cjktools import scripts
    from cjktools.resources import kanjidic

    # Build a kana-to-phoneme lookup table from the tab-separated data files.
    lkp = {}
    for fn in ["lib/data/phon/ja-Hira", "lib/data/phon/ja-Kata"]:
        with open(fn) as f:
            for line in f:
                line = line.strip()
                if line == "":
                    continue
                kv = line.split("\t")
                if len(kv) != 2:
                    print("!", kv, file=sys.stderr)
                    continue
                k = kv[0].strip()
                v = kv[1].strip()
                lkp.setdefault(k, []).append(v)

    kjd = kanjidic.Kanjidic(kanjidic_files=["lib/data/dict/ja"])
    op = ""
    segs = scripts.script_boundaries(token)
    for seg in segs:
        tipus = scripts.script_types(seg)
        if scripts.Script.Kanji in tipus:
            # Replace each kanji with its first on-reading from KANJIDIC.
            for ch in seg:
                if ch in kjd and kjd[ch].on_readings:
                    op += kjd[ch].on_readings[0]
        else:
            op += seg

    res = _maxphon(lkp, op)
    if res == "":
        return "?"
    return res
Example #9
    def process_response(self, request, response):
        if response.status_code != 200:
            return response

        if not response.get('Content-Type', '').startswith('text/html'):
            return response

        content = response.content.decode('utf8')
        if not scripts.script_types(content).intersection(
                    self.japanese_scripts):
            return response

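        # Wrap each run of Japanese script in a lang="ja" span.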
        parts = []
        for part in scripts.script_boundaries(content):
            if scripts.script_type(part) in self.japanese_scripts:
                parts.append('<span lang="ja" xml:lang="ja">%s</span>' % part)
            else:
                parts.append(part)

        response.content = u''.join(parts).encode('utf8')

        return response
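
For reference, scripts.script_boundaries splits a string into runs of a
single script, which is what drives the span-wrapping loop above. A rough
sketch (assuming cjktools is installed; the output shown is illustrative):

from cjktools import scripts

# Each returned segment is a maximal run of a single script type.
print(scripts.script_boundaries(u'日本語のinput'))
# e.g. [u'日本語', u'の', u'input']
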
Example #10
def check_scripts(plain):
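    """Return True if the text contains any ASCII characters."""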
    return scripts.Script.Ascii in scripts.script_types(plain)
Example #11
    def _get_stimulus_class(self, stimulus):
        # script_types() returns the set of scripts found in the stimulus.
        if scripts.script_types(stimulus) == set([scripts.Script.Ascii]):
            return 'stimulus_roman'
        else:
            return 'stimulus_cjk'