def group_kana(self, string):
    """Split the kanji field into a list of parts.

    Consecutive kana characters are merged into a single part whose
    ``base`` accumulates them; every other character gets a part of its
    own. Parts are created through ``self._new_part``. Scanning stops at
    the first character found in ``KanjiWord.SEP``.
    """
    parts = []
    kana_run = None  # kana part still being accumulated, if any

    for char in string:
        if char in KanjiWord.SEP:
            # Alternate readings after the separator are ignored for now
            break

        if not kana.is_kana(char):
            # A non-kana character ends the kana run in progress
            if kana_run is not None:
                parts.append(kana_run)
                kana_run = None
            single = self._new_part(is_kanji=True)
            single['base'] = char
            parts.append(single)
            continue

        if kana_run is None:
            kana_run = self._new_part(is_kanji=False)
            kana_run['is_kanji'] = False
        kana_run['base'] += char

    # Flush a kana run that reached the end of the input
    if kana_run is not None:
        parts.append(kana_run)
    return parts
def load_anki_data(kanji_list):
    """Build the set of known kanji and report inconsistencies.

    ``kanji_list`` is the collection of kanji present in the dictionary.
    Three reports are sent through ``message`` when applicable: kanji used
    by words but lacking a card, kanji cards that no word uses, and kanji
    found in neither ``kanji_list`` nor the extra dictionary.

    Returns the union of the kanji that have cards and the kanji used by
    words; suspended entries are excluded from both.
    """
    dictionary = set(kanji_list)

    # Kanji that have their own (non-suspended) card
    with_cards = set()
    for card in Kanji.all():
        if not card.suspended:
            with_cards.add(card.kanji)

    # Kanji appearing in non-suspended counters / kanji words
    in_words = set()
    for word in Counter.all() + KanjiWord.all():
        if not word.suspended:
            # Kana inside a word are not kanji, leave them out
            in_words.update(ch for ch in word.kanji if not kana.is_kana(ch))

    extra = load_extra(settings.EXTRA_DICT_KANJI)
    known = with_cards | in_words

    # Used by a word, but there is no card for it
    without_card = in_words - with_cards
    if without_card:
        message("Missing Kanji Found", ' '.join(without_card))

    # Has a card, but no word shows it in use
    without_example = with_cards - in_words
    if without_example:
        message("Kanji with no Examples", ' '.join(without_example))

    # In the deck, but absent from every dictionary source
    not_in_dict = known - (dictionary | extra)
    if not_in_dict:
        message("Unknown Kanji, not in Dict:", ' '.join(not_in_dict))

    return known
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""Call ``mark_suspended(False)`` on every KanjiWord that only uses known kanji.

A kanji counts as known when it has a non-suspended Kanji card.
"""
from models.kanji import Kanji
from models.kanji_word import KanjiWord
from utf8_helper import force_UTF8
import kana
import settings

if __name__ == '__main__':
    force_UTF8()

    # First we need to read our whitelist: kanji with a non-suspended card
    whitelist = set()
    for kanji in Kanji.all():
        if not kanji.suspended:
            whitelist.add(kanji.kanji)

    # Now we filter out any KanjiWords that use other kanji
    for kanji_word in KanjiWord.all():
        # Kana never disqualify a word; only NON-kana characters have to be
        # whitelisted. (The check used to be inverted: it rejected every word
        # containing kana and accepted words made of unknown kanji.)
        fine = all(
            kana.is_kana(char) or char in whitelist
            for char in kanji_word.kanji
        )
        if fine:
            # presumably un-suspends the word -- confirm against KanjiWord
            kanji_word.mark_suspended(False)
data[key]['words'].append((word, readings)) if __name__ == '__main__': force_UTF8() missing = {} # Now we need to find if all the readings are found for word in KanjiWord.all(): for reading in word.kanji_readings: try: kanji = Kanji.find(reading['base']) except KeyError: if kana.is_kana(reading['base']) and reading['base'] != u'ヶ': raise AnkiModel.Error(u"Kana mismatch: %s word(%s) reading(%s)" % ( reading['base'], word.kanji, word.reading )) else: # Make sure not to do the rest of the work # otherwise you'll use the previous kanji continue # raise AnkiModel.Error(u"Kanji not found, but in use: %s word(%s)" % ( # reading['base'], word.kanji # )) # Now that we have the kanji, check if this reading is used if kanji.kanji == '々': pass elif kana.all_to_hiragana(reading['reading']) in kanji.readings:
if __name__ == '__main__': force_UTF8() args = parse() # Find all the kanji that are in the deck all_kanji = set() for word in KanjiWord.all(): for kanji in word.kanji: all_kanji.add(kanji) for kanji in Kanji.all(): all_kanji.add(kanji) # Count which kanji the input data has data = Counter(unicode(sys.stdin.read())) for char, count in data.most_common(): # we don't want kana if kana.is_kana(char): del data[char] # Nor do we want kanji we know if char in all_kanji: del data[char] # Nor any non-kanji chars if not kana.is_kanji(char): del data[char] for char, count in data.most_common(args.count): print char, count
data[key]['words'].append((word, readings)) if __name__ == '__main__': force_UTF8() missing = {} # Now we need to find if all the readings are found for word in KanjiWord.all(): for reading in word.kanji_readings: try: kanji = Kanji.find(reading['base']) except KeyError: if kana.is_kana(reading['base']) and reading['base'] != u'ヶ': raise AnkiModel.Error( u"Kana mismatch: %s word(%s) reading(%s)" % (reading['base'], word.kanji, word.reading)) else: # Make sure not to do the rest of the work # otherwise you'll use the previous kanji continue # raise AnkiModel.Error(u"Kanji not found, but in use: %s word(%s)" % ( # reading['base'], word.kanji # )) # Now that we have the kanji, check if this reading is used if kanji.kanji == '々': pass elif kana.all_to_hiragana(reading['reading']) in kanji.readings: