def filterDbByMat( db, mat ):
    '''Return a new MorphDb holding only the entries of `db` whose location
    maturity is strictly greater than `mat`.

    Assumes it is safe to use the cached locDb (it is read with recalc=False).
    '''
    filtered = MorphDb()
    cachedLocDb = db.locDb( recalc=False )
    for location, morphs in cachedLocDb.iteritems():
        if not ( location.maturity > mat ):
            continue
        filtered.addMsL( morphs, location )
    return filtered
def filterDbByMat(db, mat):
    '''Copy into a fresh MorphDb every location (with its morphemes) from `db`
    whose maturity exceeds `mat`.

    Assumes safe to use cached locDb: the location db is fetched with
    recalc=False.
    '''
    result = MorphDb()
    locations = db.locDb(recalc=False)
    for where, morphemes in locations.iteritems():
        if where.maturity > mat:
            result.addMsL(morphemes, where)
    return result
def run( duelingSubsPath, outputSubsPath, whitelist, blacklist, matureFmt, knownFmt, unknownFmt ):
    '''Rewrite a dueling-subs file, formatting each dialogue pair by how well it is known.

    Dialogue lines are consumed in pairs (first line of a pair is bound to
    `jpn`, second to `eng`). For each pair the morphemes of the first line are
    checked against the known and mature dbs, and one of the three format
    strings is applied to a dict with keys 'jpn', 'eng', 'N_k', 'N_m',
    'unknowns' and 'unmatures'.

    duelingSubsPath   -- input subtitle file, read as utf-8
    outputSubsPath    -- output subtitle file, written as utf-8
    whitelist/blacklist -- passed through to getMorphemes
    matureFmt         -- %-format used when nothing is missing from the mature db
    knownFmt          -- %-format used when nothing is missing from the known db
    unknownFmt        -- %-format used otherwise

    Raises AssertionError if the number of dialogue lines is odd.
    '''
    # Load files
    kdb = MorphDb( cfg1('path_known') )
    mdb = MorphDb( cfg1('path_mature') )
    # Context manager so the input handle is closed even if reading fails.
    with codecs.open( duelingSubsPath, 'r', 'utf-8' ) as inFile:
        subFileLines = inFile.readlines()

    # Get dueling subs
    dialogueLines = [ l for l in subFileLines if l.startswith( u'Dialogue' ) ]
    if dialogueLines:
        # Everything before the first dialogue line is copied through untouched.
        header = subFileLines[ : subFileLines.index( dialogueLines[0] ) ]
    else:
        # No dialogue at all: the whole file is header (avoids IndexError on dialogueLines[0]).
        header = subFileLines
    assert len( dialogueLines ) % 2 == 0, 'Should be an even number of dialogue lines'

    lines = []
    for i in xrange( 0, len( dialogueLines ), 2 ):
        rawJpn, rawEng = dialogueLines[i:i+2]
        # The prefix is taken from the raw first line, before its text is extracted.
        jpn, eng, pre = getText( rawJpn ), getText( rawEng ), getPreText( rawJpn )

        # get unknowns
        ms = getMorphemes( jpn, whitelist, blacklist )
        unknowns, N_k = getNotInDb( ms, kdb.db )
        unmatures, N_m = getNotInDb( ms, mdb.db )
        d = { 'jpn':jpn, 'eng':eng, 'N_k':N_k, 'N_m':N_m, 'unknowns':unknowns, 'unmatures':unmatures }

        if N_m == 0:
            lines.append( pre + matureFmt % d )
        elif N_k == 0:
            lines.append( pre + knownFmt % d )
        else:
            lines.append( pre + unknownFmt % d )

    # Context manager so the output is flushed and closed even if a write fails.
    with codecs.open( outputSubsPath, 'w', 'utf-8' ) as outFile:
        outFile.write( u''.join( header ) )
        outFile.write( u'\n'.join( lines ) )
def mkAllDb( allDb=None ):
    '''Create or refresh the all.db MorphDb from every enabled note in the collection.

    allDb -- existing MorphDb to update; a new empty MorphDb is used when this
             is None (or otherwise falsy).

    Returns the updated MorphDb. When cfg1('saveDbs') is set, it is also saved
    to cfg1('path_all').

    A TypeError raised while reading a field is reported through errorMsg and
    then re-raised.
    '''
    t_0, db, TAG = time.time(), mw.col.db, mw.col.tags
    N_notes = db.scalar( 'select count() from notes' )
    mw.progress.start( label='Prep work for all.db creation', max=N_notes, immediate=True )

    if not allDb: allDb = MorphDb()
    fidDb = allDb.fidDb()
    locDb = allDb.locDb( recalc=False ) # fidDb() already forces locDb recalc

    mw.progress.update( label='Generating all.db data' )
    # Pre-bind the loop index: it is read after the loop for the progress
    # updates, and the for statement never assigns it when there are no notes.
    i = 0
    for i,( nid, mid, flds, guid, tags ) in enumerate( db.execute( 'select id, mid, flds, guid, tags from notes' ) ):
        if i % 500 == 0: mw.progress.update( value=i )
        C = partial( cfg, mid, None )
        if not C('enabled'): continue

        # One maturity per card of the note: the card's ivl, except 0.5 when ivl is 0 and type is 1.
        mats = [ ( 0.5 if ivl == 0 and ctype == 1 else ivl ) for ivl, ctype in db.execute( 'select ivl, type from cards where nid = :nid', nid=nid ) ]
        ts, alreadyKnownTag = TAG.split( tags ), C('tag_alreadyKnown')
        # Notes carrying the already-known tag get an extra maturity just above the mature threshold.
        if alreadyKnownTag in ts:
            mats += [ C('threshold_mature')+1 ]

        for fieldName in C('morph_fields'):
            try: # if doesn't have field, continue
                #fieldValue = normalizeFieldValue( getField( fieldName, flds, mid ) )
                fieldValue = getMecabField( fieldName, flds, mid )
            except KeyError: continue
            except TypeError:
                mname = mw.col.models.get( mid )[ 'name' ]
                errorMsg( u'Failed to get field "{field}" from a note of model "{model}". Please fix your config.py file to match your collection appropriately and ignore the following error.'.format( model=mname, field=fieldName ) )
                raise

            loc = fidDb.get( ( nid, guid, fieldName ), None )
            if not loc:
                # Field not seen before: record it only if it yields morphemes.
                loc = AnkiDeck( nid, fieldName, fieldValue, guid, mats )
                ms = getMorphemes( fieldValue )
                if ms: #TODO: this needed? should we change below too then?
                    #printf( ' .loc for %d[%s]' % ( nid, fieldName ) )
                    locDb[ loc ] = ms
            else:
                # mats changed -> new loc (new mats), move morphs
                if loc.fieldValue == fieldValue and loc.maturities != mats:
                    printf( ' .mats for %d[%s]' % ( nid, fieldName ) )
                    newLoc = AnkiDeck( nid, fieldName, fieldValue, guid, mats )
                    locDb[ newLoc ] = locDb.pop( loc )
                # field changed -> new loc, new morphs
                elif loc.fieldValue != fieldValue:
                    printf( ' .morphs for %d[%s]' % ( nid, fieldName ) )
                    newLoc = AnkiDeck( nid, fieldName, fieldValue, guid, mats )
                    ms = getMorphemes( fieldValue )
                    locDb.pop( loc )
                    locDb[ newLoc ] = ms

    printf( 'Processed all %d notes in %f sec' % ( N_notes, time.time() - t_0 ) )
    mw.progress.update( value=i, label='Creating all.db object' )
    allDb.clear()
    allDb.addFromLocDb( locDb )
    if cfg1('saveDbs'):
        mw.progress.update( value=i, label='Saving all.db to disk' )
        allDb.save( cfg1('path_all') )
        printf( 'Processed all %d notes + saved all.db in %f sec' % ( N_notes, time.time() - t_0 ) )
    mw.progress.finish()
    return allDb
def main():
    '''Run a full recalc: load (optionally) and rebuild all.db, merge the
    external db into it, update notes, refresh stats and the toolbar, and
    publish the result as util._allDb.
    '''
    # load existing all.db
    mw.progress.start(label='Loading existing all.db', immediate=True)
    startedAt = time.time()
    if cfg1('loadAllDb'):
        existingDb = util.allDb()
    else:
        existingDb = None
    printf('Loaded all.db in %f sec' % (time.time() - startedAt))
    mw.progress.finish()

    # update all.db
    rebuiltDb = mkAllDb(existingDb)

    # merge in external.db
    mw.progress.start(label='Merging ext.db', immediate=True)
    externalDb = MorphDb(cfg1('path_ext'), ignoreErrors=True)
    rebuiltDb.merge(externalDb)
    mw.progress.finish()

    # update notes
    known = updateNotes(rebuiltDb)

    # update stats and refresh display
    stats.updateStats(known)
    mw.toolbar.draw()

    # set global allDb
    util._allDb = rebuiltDb
def pre(b):
    '''Ask where to save the exported morpheme db.

    Returns a dict with 'dbpath' (the chosen path as str) and 'morphDb'
    (a new empty MorphDb), or None when no path was chosen.
    '''
    from util import dbsPath  # not defined until late, so don't import at top of module
    suggested = os.sep.join([dbsPath, 'exportedMorphs.db'])
    chosen = QFileDialog.getSaveFileName(caption='Save morpheme db to?', directory=suggested)
    if chosen:
        return {'dbpath': str(chosen), 'morphDb': MorphDb()}
    return None
def pre(b):  # :: Browser -> State
    '''Prompt for the tags to apply and for a morpheme db to open.

    Returns a dict with 'b' (the argument), 'db' (the MorphDb loaded from the
    chosen path) and 'tags' (the entered text as unicode), or None when
    either dialog is cancelled or left empty.
    '''
    tags, ok = QInputDialog.getText(b, 'Enter tags', 'Tags', QLineEdit.Normal, 'hasMorph')
    if not (ok and tags):
        return None
    dbPath = QFileDialog.getOpenFileName(caption='Open db', directory=util.dbsPath)
    if not dbPath:
        return None
    return {'b': b, 'db': MorphDb(dbPath), 'tags': unicode(tags)}
def onExtractTxtFile( self ):
    '''Ask for a source text file and a destination, build a MorphDb from the
    text with the configured import maturity, and save it.

    Returns without doing anything when either dialog yields no path. The
    success message is shown only when a db was produced and saved.
    '''
    sourceFile = QFileDialog.getOpenFileName( caption='Text file to extract from?', directory=dbsPath )
    if not sourceFile:
        return

    suggested = os.sep.join( [ dbsPath, 'textFile.db' ] )
    targetFile = QFileDialog.getSaveFileName( caption='Save morpheme db to?', directory=suggested )
    if not targetFile:
        return

    maturity = cfg1('text file import maturity')
    extracted = MorphDb.mkFromFile( str(sourceFile), maturity )
    if not extracted:
        return
    extracted.save( str(targetFile) )
    infoMsg( 'Extracted successfully' )
def onExtractTxtFile( self ):
    '''Ask for a source text file and a destination, build a MorphDb from the
    text using the morphemizer currently selected in the combo box and the
    configured import maturity, and save it.

    Returns without doing anything when either dialog yields no path. The
    success message is shown only when a db was produced and saved.
    '''
    sourceFile = QFileDialog.getOpenFileName( caption='Text file to extract from?', directory=dbsPath )
    if not sourceFile:
        return

    suggested = os.sep.join( [ dbsPath, 'textFile.db' ] )
    targetFile = QFileDialog.getSaveFileName( caption='Save morpheme db to?', directory=suggested )
    if not targetFile:
        return

    maturity = cfg1('text file import maturity')
    sourceText = str(sourceFile)
    # The combo box index selects from the list returned by getAllMorphemizers().
    chosenMorphemizer = getAllMorphemizers()[self.morphemizerComboBox.currentIndex()]
    extracted = MorphDb.mkFromFile( sourceText, chosenMorphemizer, maturity )
    if not extracted:
        return
    extracted.save( str(targetFile) )
    infoMsg( 'Extracted successfully' )
def updateStats(knownDb=None):
    '''Recompute and save the morpheme statistics.

    knownDb -- MorphDb of known morphemes; loaded from cfg1('path_known')
               (ignoring errors) when None.

    Returns the stats dict that was passed to saveStats:
      'totalKnown' -- len(knownDb.db)
      'goals'      -- one entry per Goal.*.db file found under cfg1('path_dbs'),
                      keyed by the part of the file name between 'Goal.' and
                      '.db', holding 'total', 'known', 'freqTotal', 'freqKnown'.
    '''
    mw.progress.start(label='Updating stats', immediate=True)

    from morphemes import MorphDb

    # Load known.db and get total morphemes known
    if knownDb is None:
        knownDb = MorphDb(cfg1('path_known'), ignoreErrors=True)
    summary = {}
    summary['totalKnown'] = len(knownDb.db)

    # Load Goal.*.db dbs, get morphemes required, and compare vs known.db
    summary['goals'] = {}
    goalPattern = os.path.join(cfg1('path_dbs'), 'Goal.*.db')
    for goalPath in glob.glob(goalPattern):
        # Drop the leading 'Goal.' (5 chars) and trailing '.db' (3 chars).
        goalName = os.path.basename(goalPath)[5:-3]
        goalDb = MorphDb(goalPath)

        # track total unique morphemes + when weighted by frequency
        # NOTE: a morpheme may occur multiple times within the same sentence, but this frequency is wrt note fields
        uniqueRequired = 0
        uniqueKnown = 0
        weightedRequired = 0
        weightedKnown = 0
        for morph in goalDb.db.iterkeys():
            occurrences = goalDb.db.frequency(morph)
            uniqueRequired += 1
            weightedRequired += occurrences
            if morph not in knownDb.db:
                continue
            uniqueKnown += 1
            weightedKnown += occurrences

        summary['goals'][goalName] = {
            'total': uniqueRequired,
            'known': uniqueKnown,
            'freqTotal': weightedRequired,
            'freqKnown': weightedKnown,
        }

    saveStats(summary)
    mw.progress.finish()
    return summary
def measure_readability(self, file_name, is_ass, is_srt):
    '''Measure one input against the known-morph db and write a report row.

    file_name -- path of the text to analyze, or the literal 'clipboard' to
                 read the text from pyperclip.paste()
    is_ass    -- treat the text as an ASS subtitle file (uses the 'Format:'
                 line to locate the 'Text' column of 'Dialogue:' lines)
    is_srt    -- treat the text as an SRT subtitle file

    Writes a header row and one tab-separated result row through
    self.writeOutput, logs to log_fp, and adds unknown morphs to unknown_db.
    When save_study_plan is set, the resulting Source is appended to sources.

    Relies on names from the enclosing scope that are not defined here:
    log_fp, all_morphs, known_db, unknown_db, save_study_plan, sources.

    Any exception is reported through self.writeOutput and re-raised.
    '''
    self.writeOutput(
        '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %
        ("Input", "Total Morphs", "Known Morphs", "% Known Morphs", "Total Instances", "Known Instances",
         "% Readability", "% Proper Nouns", "% Known Lines", "% i+1 Lines"))

    # file_name will be 'clipboard' if reading from clipboard
    log_fp.write('measure_readability %s\n' % file_name)

    proper_noun_count = 0
    i_count = 0
    line_count = 0
    line_morphs = []
    known_line_count = 0
    iplus1_line_count = 0
    known_count = 0
    seen_morphs = {}
    known_morphs = {}
    source_unknown_db = MorphDb()

    def proc_lines(text, is_ass, is_srt):
        text_index = -1
        num_fields = 1
        srt_count = 0

        def parse_text(text):
            # Only the counters are rebound here; the dicts, lists and dbs are
            # mutated in place, so they need no nonlocal declaration.
            nonlocal i_count, known_count, proper_noun_count
            nonlocal line_count, known_line_count, iplus1_line_count

            log_fp.write('=== parse_text ===\n' + text + '\n')
            parsed_morphs = getMorphemes(self.morphemizer, stripHTML(text))
            if len(parsed_morphs) == 0:
                return

            unknown_count = 0
            line_missing_morphs = set()
            for m in parsed_morphs:
                # Count morph for word report
                all_morphs[m] = all_morphs.get(m, 0) + 1
                seen_morphs[m] = seen_morphs.get(m, 0) + 1

                if m.isProperNoun():
                    proper_noun_count += 1
                    is_proper_noun = True
                else:
                    is_proper_noun = False

                i_count += 1
                # Proper nouns are easy to learn, so assume they're known.
                if known_db.matches(m) or is_proper_noun:
                    known_morphs[m] = known_morphs.get(m, 0) + 1
                    known_count += 1
                else:
                    unknown_db.addMorph(m, 1)
                    source_unknown_db.addMorph(m, 1)
                    line_missing_morphs.add(m)
                    unknown_count += 1

            line_count += 1
            if unknown_count == 0:
                known_line_count += 1
            elif unknown_count == 1:
                iplus1_line_count += 1
            line_morphs.append(line_missing_morphs)

        filtered_text = ''
        for t in text.splitlines():
            should_flush = True
            if is_ass:
                if 'Format:' in t:
                    # Remember which comma-separated column holds the text.
                    formats = [x.strip() for x in t[8:].split(',')]
                    if 'Text' in formats:
                        text_index = formats.index('Text')
                        num_fields = len(formats)
                    else:
                        text_index = -1
                    continue
                elif ('Dialogue:' not in t) or (text_index < 0):
                    continue
                # Limit the split so commas inside the last column are kept.
                t = t[9:].split(',', num_fields - 1)
                t = t[text_index]
            elif is_srt:
                srt_count += 1
                if srt_count <= 2:
                    # First two lines of each block are skipped
                    # (presumably the cue index and timestamp - confirm).
                    continue
                elif t == '':
                    # Blank line ends the block: reset and flush what was gathered.
                    srt_count = 0
                else:
                    should_flush = False

            if t != '':
                filtered_text += t + '\n'

            # Todo: This will flush every line so we can compute per-line readability, which is slower than batching lines.
            # Figure out how to get per-line analysis with batched lines.
            if should_flush:
                parse_text(filtered_text)
                filtered_text = ''

        parse_text(filtered_text)

    try:
        if file_name == 'clipboard':
            content = pyperclip.paste()
        else:
            with open(file_name.strip(), 'rt', encoding='utf-8') as f:
                content = f.read()
        # Strip byte-order marks wherever they appear in the text.
        content = content.replace(u'\ufeff', '')

        proc_lines(content, is_ass, is_srt)
        source = Source(file_name, seen_morphs, line_morphs, source_unknown_db)

        known_percent = 0.0 if len(seen_morphs) == 0 else 100.0 * len(known_morphs) / len(seen_morphs)
        readability = 0.0 if i_count == 0 else 100.0 * known_count / i_count
        # Guard on the actual divisor (i_count), consistent with readability above.
        proper_noun_percent = 0.0 if i_count == 0 else 100.0 * proper_noun_count / i_count
        line_percent = 0.0 if line_count == 0 else 100.0 * known_line_count / line_count
        iplus1_percent = 0.0 if line_count == 0 else 100.0 * iplus1_line_count / line_count

        self.writeOutput(
            '%s\t%d\t%d\t%0.2f\t%d\t%d\t%0.2f\t%0.2f\t%0.2f\t%0.2f\n' %
            (source.name, len(seen_morphs), len(known_morphs), known_percent, i_count, known_count,
             readability, proper_noun_percent, line_percent, iplus1_percent))

        if save_study_plan:
            sources.append(source)
    except:
        # Deliberately broad: report which input failed, then let the original error propagate.
        self.writeOutput("Failed to process '%s'\n" % file_name)
        raise
def mkAllDb(allDb=None):
    '''Create or refresh the all.db MorphDb from every enabled note in the collection.

    allDb -- existing MorphDb to update; a new empty MorphDb is used when this
             is None (or otherwise falsy).

    Returns the updated MorphDb. When cfg1('saveDbs') is set, it is also saved
    to cfg1('path_all').

    A TypeError raised while reading a field is reported through errorMsg and
    then re-raised.
    '''
    t_0, db, TAG = time.time(), mw.col.db, mw.col.tags
    N_notes = db.scalar('select count() from notes')
    mw.progress.start(label='Prep work for all.db creation', max=N_notes, immediate=True)

    if not allDb:
        allDb = MorphDb()
    fidDb = allDb.fidDb()
    locDb = allDb.locDb(recalc=False)  # fidDb() already forces locDb recalc

    mw.progress.update(label='Generating all.db data')
    # Pre-bind the loop index: it is read after the loop for the progress
    # updates, and the for statement never assigns it when there are no notes.
    i = 0
    for i, (nid, mid, flds, guid, tags) in enumerate(
            db.execute('select id, mid, flds, guid, tags from notes')):
        if i % 500 == 0:
            mw.progress.update(value=i)
        C = partial(cfg, mid, None)
        if not C('enabled'):
            continue

        # One maturity per card of the note: the card's ivl, except 0.5 when ivl is 0 and type is 1.
        mats = [(0.5 if ivl == 0 and ctype == 1 else ivl)
                for ivl, ctype in db.execute(
                    'select ivl, type from cards where nid = :nid', nid=nid)]
        # With 'ignore maturity' set, every card counts as maturity 0.
        if C('ignore maturity'):
            mats = [0 for mat in mats]
        ts, alreadyKnownTag = TAG.split(tags), C('tag_alreadyKnown')
        # Notes carrying the already-known tag get an extra maturity just above the mature threshold.
        if alreadyKnownTag in ts:
            mats += [C('threshold_mature') + 1]

        for fieldName in C('morph_fields'):
            try:  # if doesn't have field, continue
                #fieldValue = normalizeFieldValue( getField( fieldName, flds, mid ) )
                fieldValue = getMecabField(fieldName, flds, mid)
            except KeyError:
                continue
            except TypeError:
                mname = mw.col.models.get(mid)['name']
                errorMsg(
                    u'Failed to get field "{field}" from a note of model "{model}". Please fix your config.py file to match your collection appropriately and ignore the following error.'
                    .format(model=mname, field=fieldName))
                raise

            loc = fidDb.get((nid, guid, fieldName), None)
            if not loc:
                # Field not seen before: record it only if it yields morphemes.
                loc = AnkiDeck(nid, fieldName, fieldValue, guid, mats)
                ms = getMorphemes(fieldValue)
                if ms:  #TODO: this needed? should we change below too then?
                    #printf( ' .loc for %d[%s]' % ( nid, fieldName ) )
                    locDb[loc] = ms
            else:
                # mats changed -> new loc (new mats), move morphs
                if loc.fieldValue == fieldValue and loc.maturities != mats:
                    printf(' .mats for %d[%s]' % (nid, fieldName))
                    newLoc = AnkiDeck(nid, fieldName, fieldValue, guid, mats)
                    locDb[newLoc] = locDb.pop(loc)
                # field changed -> new loc, new morphs
                elif loc.fieldValue != fieldValue:
                    printf(' .morphs for %d[%s]' % (nid, fieldName))
                    newLoc = AnkiDeck(nid, fieldName, fieldValue, guid, mats)
                    ms = getMorphemes(fieldValue)
                    locDb.pop(loc)
                    locDb[newLoc] = ms

    printf('Processed all %d notes in %f sec' % (N_notes, time.time() - t_0))
    mw.progress.update(value=i, label='Creating all.db object')
    allDb.clear()
    allDb.addFromLocDb(locDb)
    if cfg1('saveDbs'):
        mw.progress.update(value=i, label='Saving all.db to disk')
        allDb.save(cfg1('path_all'))
        printf('Processed all %d notes + saved all.db in %f sec' % (N_notes, time.time() - t_0))
    mw.progress.finish()
    return allDb
def mkAllDb( allDb=None ):
    '''Create or refresh the all.db MorphDb from every note that matches a filter.

    allDb -- existing MorphDb to update; a new empty MorphDb is used when this
             is None (or otherwise falsy).

    Returns the updated MorphDb (also saved to cfg1('path_all') when
    cfg1('saveDbs') is set), or None - after showing an error message - when
    no note matched any filter.

    A TypeError raised while reading a field is reported through errorMsg and
    then re-raised. The config module is reloaded on every call.
    '''
    import config
    reload(config)

    startTime, db, TAG = time.time(), mw.col.db, mw.col.tags
    N_notes = db.scalar( 'select count() from notes' )
    N_enabled_notes = 0 # for providing an error message if there is no note that is used for processing
    mw.progress.start( label='Prep work for all.db creation', max=N_notes, immediate=True )

    if not allDb:
        allDb = MorphDb()
    fidDb = allDb.fidDb()
    locDb = allDb.locDb( recalc=False ) # fidDb() already forces locDb recalc

    mw.progress.update( label='Generating all.db data' )
    noteRows = db.execute( 'select id, mid, flds, guid, tags from notes' )
    for i, noteRow in enumerate( noteRows ):
        nid, mid, flds, guid, tags = noteRow
        if i % 500 == 0:
            mw.progress.update( value=i )
        C = partial( cfg, mid, None )

        # Notes without a matching filter are not processed at all.
        notecfg = getFilter( mw.col.getNote( nid ) )
        if notecfg is None:
            continue
        morphemizer = getMorphemizerByName( notecfg['Morphemizer'] )
        N_enabled_notes += 1

        # One maturity per card: the card's ivl, except 0.5 when ivl is 0 and type is 1.
        mats = []
        for ivl, ctype in db.execute( 'select ivl, type from cards where nid = :nid', nid=nid ):
            if ivl == 0 and ctype == 1:
                mats.append( 0.5 )
            else:
                mats.append( ivl )
        if C('ignore maturity'):
            mats = [ 0 ] * len( mats )

        ts = TAG.split( tags )
        alreadyKnownTag = jcfg('Tag_AlreadyKnown')
        if alreadyKnownTag in ts:
            mats.append( C('threshold_mature') + 1 )

        for fieldName in notecfg['Fields']:
            # if doesn't have field, continue
            try:
                fieldValue = extractFieldData( fieldName, flds, mid )
            except KeyError:
                continue
            except TypeError:
                mname = mw.col.models.get( mid )[ 'name' ]
                errorMsg( u'Failed to get field "{field}" from a note of model "{model}". Please fix your config.py file to match your collection appropriately and ignore the following error.'.format( model=mname, field=fieldName ) )
                raise

            loc = fidDb.get( ( nid, guid, fieldName ), None )
            if not loc:
                # Field not seen before: record it only if it yields morphemes.
                loc = AnkiDeck( nid, fieldName, fieldValue, guid, mats )
                ms = getMorphemes( morphemizer, fieldValue, ts )
                if ms: #TODO: this needed? should we change below too then?
                    locDb[ loc ] = ms
                continue

            if loc.fieldValue != fieldValue:
                # field changed -> new loc, new morphs
                newLoc = AnkiDeck( nid, fieldName, fieldValue, guid, mats )
                ms = getMorphemes( morphemizer, fieldValue, ts )
                locDb.pop( loc )
                locDb[ newLoc ] = ms
            elif loc.maturities != mats:
                # mats changed -> new loc (new mats), move morphs
                newLoc = AnkiDeck( nid, fieldName, fieldValue, guid, mats )
                movedMorphs = locDb.pop( loc )
                locDb[ newLoc ] = movedMorphs

    if N_enabled_notes == 0:
        mw.progress.finish()
        errorMsg(u'There is no card that can be analyzed or be moved. Add cards or (re-)check your configuration under "Tools -> MorhpMan Preferences" or in "Anki/addons/morph/config.py" for mistakes.')
        return None

    printf( 'Processed all %d notes in %f sec' % ( N_notes, time.time() - startTime ) )
    mw.progress.update( value=i, label='Creating all.db object' )
    allDb.clear()
    allDb.addFromLocDb( locDb )
    if cfg1('saveDbs'):
        mw.progress.update( value=i, label='Saving all.db to disk' )
        allDb.save( cfg1('path_all') )
        printf( 'Processed all %d notes + saved all.db in %f sec' % ( N_notes, time.time() - startTime ) )
    mw.progress.finish()
    return allDb
def updateNotes(allDb):
    '''Recompute MorphMan fields, tags and new-card ordering for every enabled note.

    allDb -- MorphDb holding all morphemes of the collection.

    Derives the seen/known/mature dbs from allDb by maturity threshold
    (saving them when cfg1('saveDbs') is set), computes each enabled note's
    Morph Man Index (mmi = 10000*N_k + 1000*lenDiff + usefulness), writes the
    configured fields and tags, and - for notes whose model has
    'set due based on mmi' enabled - sets the due value of their new cards
    (type = 0) to the mmi. Only rows whose values actually changed are
    written.

    Returns the known MorphDb.
    '''
    t_0, now, db, TAG = time.time(), intTime(), mw.col.db, mw.col.tags
    ds, nid2mmi = [], {}
    N_notes = db.scalar('select count() from notes')
    mw.progress.start(label='Updating data', max=N_notes, immediate=True)
    fidDb = allDb.fidDb()
    locDb = allDb.locDb(recalc=False)  # fidDb() already forces locDb recalc

    # handle secondary databases
    mw.progress.update(label='Creating seen/known/mature from all.db')
    seenDb = filterDbByMat(allDb, cfg1('threshold_seen'))
    knownDb = filterDbByMat(allDb, cfg1('threshold_known'))
    matureDb = filterDbByMat(allDb, cfg1('threshold_mature'))
    mw.progress.update(label='Loading priority.db')
    priorityDb = MorphDb(cfg1('path_priority'), ignoreErrors=True).db

    if cfg1('saveDbs'):
        mw.progress.update(label='Saving seen/known/mature dbs')
        seenDb.save(cfg1('path_seen'))
        knownDb.save(cfg1('path_known'))
        matureDb.save(cfg1('path_mature'))

    mw.progress.update(label='Calculating frequency information')

    mw.progress.update(label='Updating notes')
    # Both names are read after the loop but only bound inside it; pre-bind
    # them so an empty collection, or one where every note is skipped, does
    # not raise NameError.
    i = 0
    tagNames = None
    for i, (nid, mid, flds, guid, tags) in enumerate(
            db.execute('select id, mid, flds, guid, tags from notes')):
        if i % 500 == 0:
            mw.progress.update(value=i)
        C = partial(cfg, mid, None)
        if not C('enabled'):
            continue

        # Get all morphemes for note
        ms = set()
        for fieldName in C('morph_fields'):
            try:
                loc = fidDb[(nid, guid, fieldName)]
                ms.update(locDb[loc])
            except KeyError:
                continue
        ms = [m for m in ms if m.pos not in C('morph_blacklist')]

        # Determine un-known/mature and i+N
        unknowns, unmatures, newKnowns = set(), set(), set()
        for m in ms:
            if m not in knownDb.db:
                unknowns.add(m)
            if m not in matureDb.db:
                unmatures.add(m)
            if m not in matureDb.db and m in knownDb.db:
                newKnowns.add(m)

        # Determine MMI - Morph Man Index
        N, N_k, N_m = len(ms), len(unknowns), len(unmatures)

        # Bail early for lite update
        if N_k > 2 and C('only update k+2 and below'):
            continue

        # average frequency of unknowns (ie. how common the word is within your collection)
        F_k = 0
        for focusMorph in unknowns:  # focusMorph used outside loop
            F_k += len(allDb.db[focusMorph])
        F_k_avg = F_k / N_k if N_k > 0 else F_k
        usefulness = F_k_avg

        # add bonus for morphs in priority.db
        isPriority = False
        for focusMorph in unknowns:
            if focusMorph in priorityDb:
                isPriority = True
                usefulness += C('priority.db weight')

        # add bonus for studying recent learned knowns (reinforce)
        for m in newKnowns:
            locs = allDb.db[m]
            if locs:
                # NOTE(review): min(1, ...) caps ivl at 1, so the bonus never
                # shrinks for longer intervals; max(1, ...) may have been
                # intended. Left unchanged because it affects MMI ordering.
                ivl = min(1, max(loc.maturity for loc in locs))
                usefulness += C('reinforce new vocab weight') / ivl  #TODO: maybe average this so it doesnt favor long sentences

        if any(m.pos == u'動詞' for m in unknowns):  #FIXME: this isn't working???
            usefulness += C('verb bonus')

        usefulness = 999 - min(999, usefulness)

        # difference from optimal length (too little context vs long sentence)
        lenDiff = max(0, min(9, abs(C('optimal sentence length') - N) - 2))
        tooLong = N > C('optimal sentence length')

        # calculate mmi
        mmi = 10000 * N_k + 1000 * lenDiff + usefulness
        if C('set due based on mmi'):
            nid2mmi[nid] = mmi

        # Fill in various fields/tags on the note based on cfg
        ts, fs = TAG.split(tags), splitFields(flds)

        # determine card type
        tagNames = (C('tag_comprehension'), C('tag_vocab'), C('tag_notReady'),
                    C('tag_alreadyKnown'), C('tag_priority'), C('tag_badLength'),
                    C('tag_tooLong'))
        compTag, vocabTag, notReadyTag, alreadyKnownTag, priorityTag, badLengthTag, tooLongTag = tagNames
        if N_m == 0:  # sentence comprehension card, m+0
            ts = [compTag] + [t for t in ts if t not in [vocabTag, notReadyTag]]
            setField(mid, fs, C('focusMorph'), u'')
        elif N_k == 1:  # new vocab card, k+1
            ts = [vocabTag] + [t for t in ts if t not in [compTag, notReadyTag]]
            # With exactly one unknown, focusMorph (left over from the loops above) is that unknown.
            setField(mid, fs, C('focusMorph'), u'%s' % focusMorph.base)
        elif N_k > 1:  # M+1+ and K+2+
            ts = [notReadyTag] + [t for t in ts if t not in [compTag, vocabTag]]

        # set type agnostic fields
        setField(mid, fs, C('k+N'), u'%d' % N_k)
        setField(mid, fs, C('m+N'), u'%d' % N_m)
        setField(mid, fs, C('morphManIndex'), u'%d' % mmi)
        setField(mid, fs, C('unknowns'), u', '.join(u.base for u in unknowns))
        setField(mid, fs, C('unmatures'), u', '.join(u.base for u in unmatures))
        setField(mid, fs, C('unknownFreq'), u'%d' % F_k_avg)

        # other tags: drop each one, then re-add it only if it still applies
        if priorityTag in ts:
            ts.remove(priorityTag)
        if isPriority:
            ts.append(priorityTag)

        if badLengthTag in ts:
            ts.remove(badLengthTag)
        if lenDiff:
            ts.append(badLengthTag)

        if tooLongTag in ts:
            ts.remove(tooLongTag)
        if tooLong:
            ts.append(tooLongTag)

        # update sql db
        tags_ = TAG.join(TAG.canonify(ts))
        flds_ = joinFields(fs)
        if flds != flds_ or tags != tags_:  # only update notes that have changed
            csum = fieldChecksum(fs[0])
            sfld = stripHTML(fs[getSortFieldIndex(mid)])
            ds.append({
                'now': now,
                'tags': tags_,
                'flds': flds_,
                'sfld': sfld,
                'csum': csum,
                'usn': mw.col.usn(),
                'nid': nid
            })

    mw.progress.update(value=i, label='Updating anki database...')
    mw.col.db.executemany(
        'update notes set tags=:tags, flds=:flds, sfld=:sfld, csum=:csum, mod=:now, usn=:usn where id=:nid',
        ds)
    # tagNames stays None when no note reached the tagging step.
    if tagNames is not None:
        TAG.register(tagNames)

    # Now reorder new cards based on MMI
    mw.progress.update(value=i, label='Updating new card ordering...')
    ds = []
    for (cid, nid, due) in db.execute('select id, nid, due from cards where type = 0'):
        if nid in nid2mmi:  # owise it was disabled
            due_ = nid2mmi[nid]
            if due != due_:  # only update cards that have changed
                ds.append({
                    'now': now,
                    'due': due_,
                    'usn': mw.col.usn(),
                    'cid': cid
                })
    mw.col.db.executemany(
        'update cards set due=:due, mod=:now, usn=:usn where id=:cid', ds)
    mw.reset()

    printf('Updated notes in %f sec' % (time.time() - t_0))
    mw.progress.finish()
    return knownDb
def __init__(self, path=None, parent=None):
    '''Create the instance's MorphDb from `path`.

    path   -- passed to MorphDb; defaults to None
    parent -- accepted but not used in this method
    '''
    loadedDb = MorphDb(path)
    self.db = loadedDb
def loadA(self):
    '''Load db "A" from the path typed into the A line edit.

    Stores the path in self.aPath and the loaded MorphDb in self.aDb. If
    self.db is currently falsy, it is set to the newly loaded db as well.
    '''
    chosenPath = self.aPathLEdit.text()
    self.aPath = chosenPath
    self.aDb = MorphDb(path=self.aPath)
    if self.db:
        return
    self.db = self.aDb
def allDb():
    '''Return the module-level all.db MorphDb, loading it from
    cfg1('path_all') (ignoring errors) the first time it is needed.
    '''
    global _allDb
    if _allDb is not None:
        return _allDb
    from morphemes import MorphDb
    _allDb = MorphDb(cfg1('path_all'), ignoreErrors=True)
    return _allDb
def mkAllDb(allDb=None):
    '''Create or refresh the all.db MorphDb from every note that matches a filter.

    allDb -- existing MorphDb to update; a new empty MorphDb is used when this
             is None (or otherwise falsy).

    Returns the updated MorphDb (also saved to cfg1('path_all') when
    cfg1('saveDbs') is set), or None - after showing an error message - when
    no note matched any filter.

    A TypeError raised while reading a field is reported through errorMsg and
    then re-raised. The config module is reloaded on every call.
    '''
    import config
    reload(config)

    began, db, TAG = time.time(), mw.col.db, mw.col.tags
    N_notes = db.scalar('select count() from notes')
    N_enabled_notes = 0  # for providing an error message if there is no note that is used for processing
    mw.progress.start(label='Prep work for all.db creation', max=N_notes, immediate=True)

    if not allDb:
        allDb = MorphDb()
    fidDb = allDb.fidDb()
    locDb = allDb.locDb(recalc=False)  # fidDb() already forces locDb recalc

    mw.progress.update(label='Generating all.db data')
    allNotes = db.execute('select id, mid, flds, guid, tags from notes')
    for i, record in enumerate(allNotes):
        nid, mid, flds, guid, tags = record
        if i % 500 == 0:
            mw.progress.update(value=i)
        C = partial(cfg, mid, None)

        # Notes without a matching filter are not processed at all.
        note = mw.col.getNote(nid)
        notecfg = getFilter(note)
        if notecfg is None:
            continue
        morphemizer = getMorphemizerByName(notecfg['Morphemizer'])
        N_enabled_notes += 1

        # One maturity per card: the card's ivl, except 0.5 when ivl is 0 and type is 1.
        cardRows = db.execute('select ivl, type from cards where nid = :nid', nid=nid)
        mats = []
        for ivl, ctype in cardRows:
            isFreshLearning = ivl == 0 and ctype == 1
            mats.append(0.5 if isFreshLearning else ivl)
        if C('ignore maturity'):
            mats = [0] * len(mats)

        ts = TAG.split(tags)
        alreadyKnownTag = jcfg('Tag_AlreadyKnown')
        if alreadyKnownTag in ts:
            mats.append(C('threshold_mature') + 1)

        for fieldName in notecfg['Fields']:
            # if doesn't have field, continue
            try:
                fieldValue = extractFieldData(fieldName, flds, mid)
            except KeyError:
                continue
            except TypeError:
                mname = mw.col.models.get(mid)['name']
                errorMsg(
                    u'Failed to get field "{field}" from a note of model "{model}". Please fix your config.py file to match your collection appropriately and ignore the following error.'
                    .format(model=mname, field=fieldName))
                raise

            loc = fidDb.get((nid, guid, fieldName), None)
            if not loc:
                # Field not seen before: record it only if it yields morphemes.
                loc = AnkiDeck(nid, fieldName, fieldValue, guid, mats)
                ms = getMorphemes(morphemizer, fieldValue, ts)
                if ms:  #TODO: this needed? should we change below too then?
                    locDb[loc] = ms
                continue

            if loc.fieldValue != fieldValue:
                # field changed -> new loc, new morphs
                newLoc = AnkiDeck(nid, fieldName, fieldValue, guid, mats)
                ms = getMorphemes(morphemizer, fieldValue, ts)
                locDb.pop(loc)
                locDb[newLoc] = ms
            elif loc.maturities != mats:
                # mats changed -> new loc (new mats), move morphs
                newLoc = AnkiDeck(nid, fieldName, fieldValue, guid, mats)
                carriedOver = locDb.pop(loc)
                locDb[newLoc] = carriedOver

    if N_enabled_notes == 0:
        mw.progress.finish()
        errorMsg(
            u'There is no card that can be analyzed or be moved. Add cards or (re-)check your configuration under "Tools -> MorhpMan Preferences" or in "Anki/addons/morph/config.py" for mistakes.'
        )
        return None

    printf('Processed all %d notes in %f sec' % (N_notes, time.time() - began))
    mw.progress.update(value=i, label='Creating all.db object')
    allDb.clear()
    allDb.addFromLocDb(locDb)
    if cfg1('saveDbs'):
        mw.progress.update(value=i, label='Saving all.db to disk')
        allDb.save(cfg1('path_all'))
        printf('Processed all %d notes + saved all.db in %f sec' % (N_notes, time.time() - began))
    mw.progress.finish()
    return allDb
def loadB(self):
    '''Load db "B" from the path typed into the B line edit.

    Stores the path in self.bPath and the loaded MorphDb in self.bDb.
    '''
    chosenPath = self.bPathLEdit.text()
    self.bPath = chosenPath
    self.bDb = MorphDb(path=self.bPath)
def onAnalyze(self):
    """Measure the readability of the configured input and write the reports.

    Settings are read from the 'DEFAULT' section of the module-level config
    ``p``. The input is either every .srt/.ass/.txt file found at the
    configured input path (a single file or a directory tree) or, when no
    usable path is configured, the clipboard contents.

    For each input a tab-separated readability row is sent to
    self.writeOutput(). Depending on the configuration a word report, a study
    plan and a frequency list are written into the output directory. The
    frequency list is built from morphs collected while building the study
    plan, so it stays empty when the study plan is disabled. A readability log
    is always written and is closed when the analysis ends, including on
    errors and on the early "no files" return.

    Side effects: sets self.morphemizer and self.known_db; morphs selected
    for the study plan are added to the in-memory known database.
    """
    self.morphemizer = getMorphemizerByName(p['DEFAULT']['currmophemizer'])
    input_path = None  # None -> analyze the clipboard instead of files
    self.writeOutput('Using morphemizer: %s \n' %
                     self.morphemizer.getDescription())
    debug_output = False

    # Only use the configured path when not minimized and inputtype is checked.
    # `not` is required here: `~` is bitwise inversion, and both ~True (-2)
    # and ~False (-1) are truthy, so the "minimized" flag would be ignored.
    #TODO if certain keypress, force analyze through clipboard
    if p['DEFAULT'].getboolean(
            'inputtype') and not p['DEFAULT'].getboolean('minimized'):
        input_path = p['DEFAULT']['inputpath']

    minimum_master_frequency = p['DEFAULT'].getint('min_master_freq')
    readability_target = p['DEFAULT'].getfloat('read_target')
    master_freq_path = p['DEFAULT']['frequencylist']
    known_words_path = p['DEFAULT']['knownmorphs']
    ext_morphs = p['DEFAULT']['externalmorphs']
    output_path = p['DEFAULT']['outputpath']
    save_frequency_list = p['DEFAULT'].getboolean('save_freqency_list')
    save_word_report = p['DEFAULT'].getboolean('save_word_report')
    save_study_plan = p['DEFAULT'].getboolean('save_study_plan')
    source_score_multiplier = p['DEFAULT'].getfloat('SourceScoreMultiplier')
    source_score_power = p['DEFAULT'].getfloat('SourceScorePower')
    proper_nouns_known = p['DEFAULT'].getboolean('ProperNounsAlreadyKnown')
    fill_all_morphs_in_plan = p['DEFAULT'].getboolean(
        'FillAllMorphsInStudyPlan')

    if not os.path.exists(output_path):
        try:
            os.makedirs(output_path)
        except OSError as e:
            # Another process may have created the directory in the meantime.
            if e.errno != errno.EEXIST:
                raise

    frequency_list_path = os.path.normpath(output_path + '/frequency.txt')
    word_report_path = os.path.normpath(output_path + '/word_freq_report.txt')
    study_plan_path = os.path.normpath(output_path + '/study_plan.txt')
    readability_log_path = os.path.normpath(output_path +
                                            '/readability_log.txt')

    log_fp = open(readability_log_path, 'wt', encoding='utf-8')
    try:
        master_db = MorphDb()
        unknown_db = MorphDb()

        master_total_instances = 0
        master_current_score = 0

        all_morphs = {}

        if os.path.isfile(master_freq_path):
            with io.open(master_freq_path, encoding='utf-8-sig') as csvfile:
                csvreader = csv.reader(csvfile, delimiter="\t")
                for row in csvreader:
                    try:
                        instances = int(row[0])
                        m = Morpheme(row[1], row[2], row[2], row[3], row[4],
                                     row[5])
                        master_db.addMorph(m, instances)
                        master_total_instances += instances
                    except Exception:
                        # Best effort: rows that cannot be parsed are skipped.
                        # KeyboardInterrupt/SystemExit still propagate.
                        pass
            self.writeOutput("Master morphs loaded: K %d V %d\n" %
                             (master_db.getTotalNormMorphs(),
                              master_db.getTotalVariationMorphs()))
        else:
            self.writeOutput("Master frequency file '%s' not found.\n" %
                             master_freq_path)
            minimum_master_frequency = 0

        if os.path.isfile(known_words_path):
            known_db = MorphDb(known_words_path, ignoreErrors=True)
            total_k = len(known_db.groups)
            total_v = len(known_db.db)
            self.writeOutput("Known morphs loaded: K %d V %d\n" %
                             (total_k, total_v))
        else:
            self.writeOutput("Known words DB '%s' not found\n" %
                             known_words_path)
            known_db = MorphDb()
        self.known_db = known_db

        if master_total_instances > 0:
            master_current_score = 0
            for ms in master_db.db.values():
                for m, c in ms.items():
                    if known_db.matches(m):
                        master_current_score += c[0]
                        c[1] = True  # mark matched
            self.writeOutput(
                "\n[Current master frequency readability] %0.02f\n" %
                (master_current_score * 100.0 / master_total_instances))

        sources = []

        def measure_readability(self, file_name, is_ass, is_srt):
            """Analyze one input (a file path or the literal 'clipboard') and
            output its readability row."""
            self.writeOutput(
                '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %
                ("Input", "Total Morphs", "Known Morphs", "% Known Morphs",
                 "Total Instances", "Known Instances", "% Readability",
                 "% Proper Nouns", "% Known Lines", "% i+1 Lines"))
            # file_name will be 'clipboard' if reading from clipboard
            log_fp.write('measure_readability %s\n' % file_name)
            proper_noun_count = 0
            i_count = 0
            line_count = 0
            line_morphs = []
            known_line_count = 0
            iplus1_line_count = 0
            known_count = 0
            seen_morphs = {}
            known_morphs = {}
            source_unknown_db = MorphDb()

            def proc_lines(text, is_ass, is_srt):
                """Extract the dialogue text from `text` and feed it to
                parse_text()."""
                nonlocal i_count, known_count, seen_morphs, known_morphs, all_morphs
                nonlocal proper_noun_count, line_count, known_line_count, iplus1_line_count, line_morphs
                text_index = -1
                num_fields = 1
                srt_count = 0

                def parse_text(text):
                    """Morphemize one chunk of text and update the counters."""
                    nonlocal i_count, known_count, seen_morphs, known_morphs, all_morphs
                    nonlocal proper_noun_count, line_count, known_line_count, iplus1_line_count, line_morphs
                    log_fp.write('=== parse_text ===\n' + text + '\n')
                    parsed_morphs = getMorphemes(self.morphemizer,
                                                 stripHTML(text))
                    if len(parsed_morphs) == 0:
                        return

                    unknown_count = 0
                    line_missing_morphs = set()
                    for m in parsed_morphs:
                        # Count morph for word report
                        all_morphs[m] = all_morphs.get(m, 0) + 1
                        seen_morphs[m] = seen_morphs.get(m, 0) + 1

                        if m.isProperNoun():
                            proper_noun_count += 1
                            is_proper_noun = True
                        else:
                            is_proper_noun = False

                        i_count += 1
                        # Proper nouns are easy to learn, so assume they're known.
                        if known_db.matches(m) or is_proper_noun:
                            known_morphs[m] = known_morphs.get(m, 0) + 1
                            known_count += 1
                        else:
                            unknown_db.addMorph(m, 1)
                            source_unknown_db.addMorph(m, 1)
                            line_missing_morphs.add(m)
                            unknown_count += 1

                    line_count += 1
                    if unknown_count == 0:
                        known_line_count += 1
                    elif unknown_count == 1:
                        iplus1_line_count += 1
                    line_morphs.append(line_missing_morphs)

                filtered_text = ''
                for t in text.splitlines():
                    should_flush = True
                    if is_ass:
                        if 'Format:' in t:
                            # Remember which comma-separated column holds the text.
                            formats = [x.strip() for x in t[8:].split(',')]
                            if 'Text' in formats:
                                text_index = formats.index('Text')
                                num_fields = len(formats)
                            else:
                                text_index = -1
                            continue
                        elif ('Dialogue:' not in t) or (text_index < 0):
                            continue
                        t = t[9:].split(',', num_fields - 1)
                        t = t[text_index]
                    elif is_srt:
                        srt_count += 1
                        if srt_count <= 2:
                            # First two lines of an entry are skipped.
                            continue
                        elif t == '':
                            # Blank line ends the entry; flush what was gathered.
                            srt_count = 0
                        else:
                            should_flush = False

                    if t != '':
                        filtered_text += t + '\n'

                    # Todo: This will flush every line so we can compute per-line readability, which is slower than batching lines.
                    # Figure out how to get per-line analysis with batched lines.
                    if should_flush:
                        parse_text(filtered_text)
                        filtered_text = ''

                parse_text(filtered_text)

            try:
                if file_name == 'clipboard':
                    input = pyperclip.paste()
                else:
                    with open(file_name.strip(), 'rt', encoding='utf-8') as f:
                        input = f.read()
                input = input.replace(u'\ufeff', '')

                proc_lines(input, is_ass, is_srt)
                source = Source(file_name, seen_morphs, line_morphs,
                                source_unknown_db)
                known_percent = 0.0 if len(seen_morphs.keys(
                )) == 0 else 100.0 * len(known_morphs) / len(
                    seen_morphs.keys())
                readability = 0.0 if i_count == 0 else 100.0 * known_count / i_count
                # Guard on the divisor itself (i_count).
                proper_noun_percent = 0.0 if i_count == 0 else 100.0 * proper_noun_count / i_count
                line_percent = 0.0 if line_count == 0 else 100.0 * known_line_count / line_count
                iplus1_percent = 0.0 if line_count == 0 else 100.0 * iplus1_line_count / line_count

                self.writeOutput(
                    '%s\t%d\t%d\t%0.2f\t%d\t%d\t%0.2f\t%0.2f\t%0.2f\t%0.2f\n' %
                    (source.name, len(seen_morphs), len(known_morphs),
                     known_percent, i_count, known_count, readability,
                     proper_noun_percent, line_percent, iplus1_percent))

                if save_study_plan:
                    sources.append(source)
            except:
                # Report which input failed, then let the error propagate.
                self.writeOutput("Failed to process '%s'\n" % file_name)
                raise

        def accepted_filetype(filename):
            return filename.lower().endswith(('.srt', '.ass', '.txt'))

        if input_path and (os.path.isfile(input_path)
                           or os.path.isdir(input_path)):
            print('getting info from files!')
            if os.path.isfile(input_path):
                # os.walk() yields nothing for a plain file, so take it directly.
                list_of_files = [input_path
                                 ] if accepted_filetype(input_path) else []
            else:
                list_of_files = []
                for (dirpath, _, filenames) in os.walk(input_path):
                    list_of_files += [
                        os.path.join(dirpath, filename)
                        for filename in filenames
                        if accepted_filetype(filename)
                    ]

            if len(list_of_files) > 0:
                #TODO ADD PROGRESS BAR
                for file_path in sorted(list_of_files, key=natural_keys):
                    if os.path.isfile(file_path):
                        extension = os.path.splitext(file_path)[1].lower()
                        measure_readability(self, file_path,
                                            extension == '.ass',
                                            extension == '.srt')
            else:
                self.writeOutput('\nNo files found to process.\n')
                return
        else:
            measure_readability(self, 'clipboard', False,
                                False)  # for clipboard run

        if save_word_report:
            self.writeOutput("\n[Saving word report to '%s'...]\n" %
                             word_report_path)
            with open(word_report_path, 'wt', encoding='utf-8') as f:
                last_count = 0
                morph_idx = 0
                group_idx = 0
                morph_total = 0.0
                all_morphs_count = sum(n for n in all_morphs.values())

                for m in sorted(all_morphs.items(),
                                key=operator.itemgetter(1),
                                reverse=True):
                    # Morphs sharing the same count share a group index.
                    if m[1] != last_count:
                        last_count = m[1]
                        group_idx += 1
                    morph_idx += 1
                    morph_delta = 100.0 * m[1] / all_morphs_count
                    morph_total += morph_delta
                    print(
                        '%d\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%0.8f\t%0.8f matches %d'
                        % (m[1], m[0].norm, m[0].base, m[0].read, m[0].pos,
                           m[0].subPos, group_idx, morph_idx, morph_delta,
                           morph_total, known_db.matches(m[0])),
                        file=f)

        learned_tot = 0
        learned_morphs = []
        all_missing_morphs = []

        def get_line_readability(show, known_db):
            """Percentage of the source's lines whose missing morphs all match
            known_db."""
            known_lines = 0
            for line_morphs in show.line_morphs:
                has_unknowns = False
                for m in line_morphs:
                    if known_db.matches(m):
                        continue
                    has_unknowns = True
                if not has_unknowns:
                    known_lines += 1
            line_readability = 0.0 if known_lines == 0 else 100.0 * known_lines / len(
                show.line_morphs)
            return line_readability

        if save_study_plan:
            self.writeOutput("\n[Saving Study Plan to '%s'...]\n" %
                             study_plan_path)
            with open(study_plan_path, 'wt', encoding='utf-8') as f:
                for s in sources:
                    if debug_output:
                        f.write('Processing %s\n' % s.name)

                    known_i = 0
                    seen_i = 0
                    learned_m = 0
                    missing_morphs = []

                    old_line_readability = get_line_readability(s, known_db)

                    for m in s.morphs.items():
                        seen_i += m[1]
                        morph = m[0]
                        if known_db.matches(morph) or (proper_nouns_known and
                                                       morph.isProperNoun()):
                            known_i += m[1]
                        else:
                            source_unknown_count = s.unknown_db.getFuzzyCount(
                                morph, known_db)
                            unknown_count = unknown_db.getFuzzyCount(
                                morph, known_db)
                            master_count = master_db.getFuzzyCount(
                                morph, known_db)
                            source_count = source_unknown_count + unknown_count

                            score = pow(
                                source_count, source_score_power
                            ) * source_score_multiplier + master_count
                            missing_morphs.append(
                                (m[0], m[1], source_unknown_count,
                                 unknown_count, master_count, score))

                            if debug_output:
                                f.write(
                                    ' missing: ' + m[0].show() +
                                    '\t[score %d ep_freq %d all_freq %d master_freq %d]\n'
                                    % (score, source_unknown_count,
                                       unknown_count, master_count))

                    all_missing_morphs += missing_morphs

                    readability = 100.0 if seen_i == 0 else known_i * 100.0 / seen_i
                    old_readability = readability

                    learned_this_source = []

                    # Pick morphs by descending score until the target is met.
                    for m in sorted(missing_morphs,
                                    key=operator.itemgetter(5),
                                    reverse=True):
                        if readability >= readability_target:
                            if debug_output:
                                f.write(' readability target reached\n')
                            break

                        if known_db.matches(m[0]):
                            if debug_output:
                                f.write(' known: %s\n' % m[0].show())
                            continue

                        if m[4] < minimum_master_frequency:
                            if debug_output:
                                f.write(
                                    ' low score: %s [score %d ep_freq %d all_freq %d master_freq %d]\n'
                                    % (m[0].show(), m[5], m[2], m[3], m[4]))
                            continue

                        learned_morphs.append(m)
                        learned_this_source.append(m)
                        known_i += s.unknown_db.getFuzzyCount(m[0], known_db)
                        learned_m += 1
                        readability = 100.0 if seen_i == 0 else known_i * 100.0 / seen_i
                        # Later sources see this morph as already known.
                        known_db.addMLs1(m[0], set())

                    new_line_readability = get_line_readability(s, known_db)

                    learned_tot += learned_m
                    source_str = "'%s' study goal: (%3d/%4d) morph readability: %0.2f -> %0.2f line readabiltiy: %0.2f -> %0.2f\n" % (
                        s.name, learned_m, learned_tot, old_readability,
                        readability, old_line_readability,
                        new_line_readability)
                    self.writeOutput(source_str)
                    f.write(source_str)

                    for m in learned_this_source:
                        f.write(
                            '\t' + m[0].show() +
                            '\t[score %d ep_freq %d all_freq %d master_freq %d]\n'
                            % (m[5], m[2], m[3], m[4]))

        if save_frequency_list:
            self.writeOutput("\n[Saving frequency list to '%s'...]\n" %
                             frequency_list_path)
            with open(frequency_list_path, 'wt', encoding='utf-8') as f:
                unique_set = set()
                # First output morphs according to the plan.
                for m in learned_morphs:
                    if m[0].base in unique_set:
                        continue
                    unique_set.add(m[0].base)
                    print(m[0].base +
                          '\t[score %d ep_freq %d all_freq %d master_freq %d]'
                          % (m[5], m[2], m[3], m[4]),
                          file=f)

                # Followed by all remaining morphs sorted by score.
                if fill_all_morphs_in_plan:
                    for m in sorted(all_missing_morphs,
                                    key=operator.itemgetter(5),
                                    reverse=True):
                        if (m[0].base in unique_set):
                            continue
                        if m[4] < minimum_master_frequency:
                            continue
                        unique_set.add(m[0].base)
                        print(
                            m[0].base +
                            '\t[score %d ep_freq %d all_freq %d master_freq %d]'
                            % (m[5], m[2], m[3], m[4]),
                            file=f)

        if master_total_instances > 0:
            master_score = 0
            for ms in master_db.db.values():
                for m, c in ms.items():
                    if known_db.matches(m):
                        master_score += c[0]
                        c[1] = True  # mark matched
            self.writeOutput(
                "\n[New master frequency readability] %0.02f -> %0.02f\n" %
                (master_current_score * 100.0 / master_total_instances,
                 master_score * 100.0 / master_total_instances))
    finally:
        # The log stays open for the whole run; always release it.
        log_fp.close()
def updateNotes(allDb):
    """Recompute MorphMan tags/fields on notes and reorder new cards by MMI.

    Derives the seen/known/mature databases from allDb (optionally saving
    them), then for every note whose filter has 'Modify' enabled computes the
    unknown/unmature morpheme counts and the Morph Man Index (MMI), updates
    the note's tags and configured fields, and writes changed notes back to
    the collection database. Finally the `due` value of new cards (type = 0)
    is set to the note's MMI when 'set due based on mmi' is enabled.

    :param allDb: MorphDb holding all morphemes of the collection.
    :return: the known-morpheme MorphDb derived from allDb.
    """
    t_0, now, db, TAG = time.time(), intTime(), mw.col.db, mw.col.tags
    ds, nid2mmi = [], {}
    N_notes = db.scalar('select count() from notes')
    mw.progress.start(label='Updating data', max=N_notes, immediate=True)
    fidDb = allDb.fidDb()
    locDb = allDb.locDb(recalc=False)  # fidDb() already forces locDb recalc

    # read tag names
    compTag, vocabTag, freshTag, notReadyTag, alreadyKnownTag, priorityTag, tooShortTag, tooLongTag = tagNames = jcfg(
        'Tag_Comprehension'), jcfg('Tag_Vocab'), jcfg('Tag_Fresh'), jcfg(
            'Tag_NotReady'), jcfg('Tag_AlreadyKnown'), jcfg(
                'Tag_Priority'), jcfg('Tag_TooShort'), jcfg('Tag_TooLong')
    TAG.register(tagNames)
    badLengthTag = jcfg2().get('Tag_BadLength')

    # handle secondary databases
    mw.progress.update(label='Creating seen/known/mature from all.db')
    seenDb = filterDbByMat(allDb, cfg1('threshold_seen'))
    knownDb = filterDbByMat(allDb, cfg1('threshold_known'))
    matureDb = filterDbByMat(allDb, cfg1('threshold_mature'))
    mw.progress.update(label='Loading priority.db')
    priorityDb = MorphDb(cfg1('path_priority'), ignoreErrors=True).db

    if cfg1('saveDbs'):
        mw.progress.update(label='Saving seen/known/mature dbs')
        seenDb.save(cfg1('path_seen'))
        knownDb.save(cfg1('path_known'))
        matureDb.save(cfg1('path_mature'))

    mw.progress.update(label='Updating notes')
    # `i` is read after the loop for the progress bar; give it a value so an
    # empty notes table does not raise NameError there.
    i = 0
    for i, (nid, mid, flds, guid, tags) in enumerate(
            db.execute('select id, mid, flds, guid, tags from notes')):
        if i % 500 == 0:
            mw.progress.update(value=i)
        C = partial(cfg, mid, None)

        note = mw.col.getNote(nid)
        notecfg = getFilter(note)
        if notecfg is None or not notecfg['Modify']:
            continue

        # Get all morphemes for note
        morphemes = set()
        for fieldName in notecfg['Fields']:
            try:
                loc = fidDb[(nid, guid, fieldName)]
                morphemes.update(locDb[loc])
            except KeyError:
                # Field has no entry in all.db; nothing to add.
                continue

        # Determine un-seen/known/mature and i+N
        unseens, unknowns, unmatures, newKnowns = set(), set(), set(), set()
        for morpheme in morphemes:
            if morpheme not in seenDb.db:
                unseens.add(morpheme)
            if morpheme not in knownDb.db:
                unknowns.add(morpheme)
            if morpheme not in matureDb.db:
                unmatures.add(morpheme)
            if morpheme not in matureDb.db and morpheme in knownDb.db:
                newKnowns.add(morpheme)

        # Determine MMI - Morph Man Index
        N, N_s, N_k, N_m = len(morphemes), len(unseens), len(unknowns), len(
            unmatures)

        # Bail early for lite update
        if N_k > 2 and C('only update k+2 and below'):
            continue

        # average frequency of unknowns (ie. how common the word is within your collection)
        F_k = 0
        for focusMorph in unknowns:  # focusMorph used outside loop
            F_k += allDb.frequency(focusMorph)
        F_k_avg = F_k // N_k if N_k > 0 else F_k
        usefulness = F_k_avg

        # add bonus for morphs in priority.db
        isPriority = False
        for focusMorph in unknowns:
            if focusMorph in priorityDb:
                isPriority = True
                usefulness += C('priority.db weight')

        # add bonus for studying recent learned knowns (reinforce)
        for morpheme in newKnowns:
            locs = allDb.db[morpheme]
            if locs:
                # NOTE(review): min(1, ...) caps ivl at 1, so the bonus does not
                # shrink for larger maturities; max(1, ...) may have been
                # intended. Left unchanged -- confirm before altering the MMI.
                ivl = min(1, max(loc.maturity for loc in locs))
                usefulness += C(
                    'reinforce new vocab weight'
                ) // ivl  #TODO: maybe average this so it doesnt favor long sentences

        if any(morpheme.pos == u'動詞'
               for morpheme in unknowns):  #FIXME: this isn't working???
            usefulness += C('verb bonus')

        # Invert so that a more useful note gets a smaller MMI component.
        usefulness = 999 - min(999, usefulness)

        # difference from optimal length range (too little context vs long sentence)
        lenDiffRaw = min(N - C('min good sentence length'),
                         max(0, N - C('max good sentence length')))
        lenDiff = min(9, abs(lenDiffRaw))

        # calculate mmi
        mmi = 10000 * N_k + 1000 * lenDiff + usefulness
        if C('set due based on mmi'):
            nid2mmi[nid] = mmi

        # Fill in various fields/tags on the note based on cfg
        ts, fs = TAG.split(tags), splitFields(flds)

        # clear any 'special' tags, the appropriate will be set in the next few lines
        ts = [
            t for t in ts
            if t not in [notReadyTag, compTag, vocabTag, freshTag]
        ]

        # determine card type
        if N_m == 0:  # sentence comprehension card, m+0
            ts = ts + [compTag]
            setField(mid, fs, jcfg('Field_FocusMorph'), u'')
        elif N_k == 1:  # new vocab card, k+1
            ts = ts + [vocabTag]
            # unknowns has exactly one element here, which the loops above
            # left bound to focusMorph.
            setField(mid, fs, jcfg('Field_FocusMorph'),
                     u'%s' % focusMorph.base)
        elif N_k > 1:  # M+1+ and K+2+
            ts = ts + [notReadyTag]
            setField(mid, fs, jcfg('Field_FocusMorph'), u'')
        elif N_m == 1:  # we have k+0, and m+1, so this card does not introduce a new vocabulary -> card for newly learned morpheme
            ts = ts + [freshTag]
            setField(mid, fs, jcfg('Field_FocusMorph'),
                     u'%s' % list(unmatures)[0].base)
        else:  # only case left: we have k+0, but m+2 or higher, so this card does not introduce a new vocabulary -> card for newly learned morpheme
            ts = ts + [freshTag]
            setField(mid, fs, jcfg('Field_FocusMorph'), u'')

        # set type agnostic fields
        setField(mid, fs, jcfg('Field_UnknownMorphCount'), u'%d' % N_k)
        setField(mid, fs, jcfg('Field_UnmatureMorphCount'), u'%d' % N_m)
        setField(mid, fs, jcfg('Field_MorphManIndex'), u'%d' % mmi)
        setField(mid, fs, jcfg('Field_Unknowns'),
                 u', '.join(u.base for u in unknowns))
        setField(mid, fs, jcfg('Field_Unmatures'),
                 u', '.join(u.base for u in unmatures))
        setField(mid, fs, jcfg('Field_UnknownFreq'), u'%d' % F_k_avg)

        # remove deprecated tag
        if badLengthTag is not None and badLengthTag in ts:
            ts.remove(badLengthTag)

        # other tags
        if priorityTag in ts:
            ts.remove(priorityTag)
        if isPriority:
            ts.append(priorityTag)

        if tooShortTag in ts:
            ts.remove(tooShortTag)
        if lenDiffRaw < 0:
            ts.append(tooShortTag)

        if tooLongTag in ts:
            ts.remove(tooLongTag)
        if lenDiffRaw > 0:
            ts.append(tooLongTag)

        # remove unnecessary tags
        if not jcfg('Option_SetNotRequiredTags'):
            unnecessary = [priorityTag, tooShortTag, tooLongTag]
            ts = [tag for tag in ts if tag not in unnecessary]

        # update sql db
        tags_ = TAG.join(TAG.canonify(ts))
        flds_ = joinFields(fs)
        if flds != flds_ or tags != tags_:  # only update notes that have changed
            csum = fieldChecksum(fs[0])
            sfld = stripHTML(fs[getSortFieldIndex(mid)])
            ds.append({
                'now': now,
                'tags': tags_,
                'flds': flds_,
                'sfld': sfld,
                'csum': csum,
                'usn': mw.col.usn(),
                'nid': nid
            })

    mw.progress.update(value=i, label='Updating anki database...')
    mw.col.db.executemany(
        'update notes set tags=:tags, flds=:flds, sfld=:sfld, csum=:csum, mod=:now, usn=:usn where id=:nid',
        ds)

    # Now reorder new cards based on MMI
    mw.progress.update(value=i, label='Updating new card ordering...')
    ds = []

    # "type = 0": new cards
    # "type = 1": learning cards [is supposed to be learning: in my case no learning card had this type]
    # "type = 2": review cards
    for (cid, nid, due) in db.execute(
            'select id, nid, due from cards where type = 0'):
        if nid in nid2mmi:  # owise it was disabled
            due_ = nid2mmi[nid]
            if due != due_:  # only update cards that have changed
                ds.append({
                    'now': now,
                    'due': due_,
                    'usn': mw.col.usn(),
                    'cid': cid
                })
    mw.col.db.executemany(
        'update cards set due=:due, mod=:now, usn=:usn where id=:cid', ds)
    mw.reset()

    printf('Updated notes in %f sec' % (time.time() - t_0))
    mw.progress.finish()
    return knownDb
def loadB(self):
    """Read the path from the dbBPath widget and load it as database B.

    Stores the path in self.bPath and the MorphDb built from it in self.bDb.
    """
    chosenPath = self.dbBPath.text()
    self.bPath = chosenPath
    self.bDb = MorphDb(path=chosenPath)