Python MorphDb 예제들, morphemes.MorphDb Python 예제들

예제 #1

0

파일 보기

파일: main.py 프로젝트: btmmzy/JapaneseStudy

def filterDbByMat( db, mat ):
    '''Return a new MorphDb holding only the locations of `db` whose maturity exceeds `mat`.

    Reads the cached location index (recalc=False), so the caller must make
    sure that cache is still safe to use.
    '''
    filtered = MorphDb()
    cachedLocs = db.locDb( recalc=False )
    for location, morphs in cachedLocs.iteritems():
        if not ( location.maturity > mat ):
            continue
        filtered.addMsL( morphs, location )
    return filtered

예제 #2

0

파일 보기

def filterDbByMat(db, mat):
    '''Copy into a fresh MorphDb every location whose maturity is above `mat`.

    The location index is taken from the cache (recalc=False); it is assumed
    to be safe to use.
    '''
    result = MorphDb()
    locIndex = db.locDb(recalc=False)
    for where, morphemes in locIndex.iteritems():
        isMatureEnough = where.maturity > mat
        if isMatureEnough:
            result.addMsL(morphemes, where)
    return result

예제 #3

0

파일 보기

파일: adaptiveSubs.py 프로젝트: whiteskulleton/JapaneseStudy

def run( duelingSubsPath, outputSubsPath, whitelist, blacklist, matureFmt, knownFmt, unknownFmt ):
    '''Rewrite a dueling-subtitle file, formatting each line pair by how well it is known.

    Lines starting with u'Dialogue' are consumed in consecutive pairs (first is
    treated as jpn, second as eng); everything before the first such line is
    copied through as the header.

    duelingSubsPath -- UTF-8 subtitle file to read
    outputSubsPath  -- UTF-8 file to write
    whitelist, blacklist -- passed through to getMorphemes
    matureFmt, knownFmt, unknownFmt -- %-format strings applied to a dict with the
        keys jpn, eng, N_k, N_m, unknowns, unmatures; chosen when N_m == 0,
        else when N_k == 0, else otherwise (in that order)

    Raises AssertionError when the number of dialogue lines is odd.
    '''
    # Load files
    kdb = MorphDb( cfg1('path_known') )
    mdb = MorphDb( cfg1('path_mature') )
    # `with` releases the handle even if reading fails
    with codecs.open( duelingSubsPath, 'r', 'utf-8' ) as subFile:
        subFileLines = subFile.readlines()

    # Get dueling subs
    dialogueLines = [ l for l in subFileLines if l.startswith( u'Dialogue' ) ]
    if dialogueLines:
        header = subFileLines[ : subFileLines.index( dialogueLines[0] ) ]
    else:
        # no dialogue at all: the whole file is header (previously an IndexError)
        header = subFileLines
    assert len( dialogueLines ) % 2 == 0, 'Should be an even number of dialogue lines'

    lines = []
    for i in xrange( 0, len( dialogueLines ), 2 ):
        jpn, eng = dialogueLines[i:i+2]
        jpn, eng, pre = getText( jpn ), getText( eng ), getPreText( jpn )

        # get unknowns
        ms = getMorphemes( jpn, whitelist, blacklist )
        unknowns, N_k = getNotInDb( ms, kdb.db )
        unmatures, N_m = getNotInDb( ms, mdb.db )
        d = { 'jpn':jpn, 'eng':eng, 'N_k':N_k, 'N_m':N_m, 'unknowns':unknowns, 'unmatures':unmatures }

        if N_m == 0:
            lines.append( pre + matureFmt % d )
        elif N_k == 0:
            lines.append( pre + knownFmt % d )
        else:
            lines.append( pre + unknownFmt % d )

    # `with` closes (and flushes) the output even if a write raises
    with codecs.open( outputSubsPath, 'w', 'utf-8' ) as outFile:
        outFile.write( u''.join( header ) )
        outFile.write( u'\n'.join( lines ) )

예제 #4

0

파일 보기

파일: main.py 프로젝트: btmmzy/JapaneseStudy

def mkAllDb( allDb=None ):
    '''Build or refresh the all.db MorphDb from every enabled note in the collection.

    allDb -- existing MorphDb to update; a new empty MorphDb is created when this
             is falsy.

    Returns the updated MorphDb. It is also saved to cfg1('path_all') when
    cfg1('saveDbs') is set. A TypeError raised while reading a configured field
    is reported through errorMsg and then re-raised.
    '''
    t_0, db, TAG = time.time(), mw.col.db, mw.col.tags
    N_notes = db.scalar( 'select count() from notes' )
    mw.progress.start( label='Prep work for all.db creation', max=N_notes, immediate=True )

    if not allDb: allDb = MorphDb()
    fidDb   = allDb.fidDb()
    locDb   = allDb.locDb( recalc=False )   # fidDb() already forces locDb recalc

    mw.progress.update( label='Generating all.db data' )
    i = 0   # keep `i` bound for the progress updates below when there are no notes
    for i,( nid, mid, flds, guid, tags ) in enumerate( db.execute( 'select id, mid, flds, guid, tags from notes' ) ):
        if i % 500 == 0:    mw.progress.update( value=i )
        C = partial( cfg, mid, None )
        if not C('enabled'): continue
        # one maturity per card of the note; ivl 0 with card type 1 is recorded as 0.5
        mats = [ ( 0.5 if ivl == 0 and ctype == 1 else ivl ) for ivl, ctype in db.execute( 'select ivl, type from cards where nid = :nid', nid=nid ) ]
        ts, alreadyKnownTag = TAG.split( tags ), C('tag_alreadyKnown')
        if alreadyKnownTag in ts:
            # tagged notes get an extra maturity just above the mature threshold
            mats += [ C('threshold_mature')+1 ]

        for fieldName in C('morph_fields'):
            try: # if doesn't have field, continue
                #fieldValue = normalizeFieldValue( getField( fieldName, flds, mid ) )
                fieldValue = getMecabField( fieldName, flds, mid )
            except KeyError: continue
            except TypeError:
                mname = mw.col.models.get( mid )[ 'name' ]
                errorMsg( u'Failed to get field "{field}" from a note of model "{model}". Please fix your config.py file to match your collection appropriately and ignore the following error.'.format( model=mname, field=fieldName ) )
                raise

            loc = fidDb.get( ( nid, guid, fieldName ), None )
            if not loc:
                # field not indexed yet -> new loc, only stored when it has morphemes
                loc = AnkiDeck( nid, fieldName, fieldValue, guid, mats )
                ms = getMorphemes( fieldValue )
                if ms: #TODO: this needed? should we change below too then?
                    #printf( '    .loc for %d[%s]' % ( nid, fieldName ) )
                    locDb[ loc ] = ms
            else:
                # mats changed -> new loc (new mats), move morphs
                if loc.fieldValue == fieldValue and loc.maturities != mats:
                    printf( '    .mats for %d[%s]' % ( nid, fieldName ) )
                    newLoc = AnkiDeck( nid, fieldName, fieldValue, guid, mats )
                    locDb[ newLoc ] = locDb.pop( loc )
                # field changed -> new loc, new morphs
                elif loc.fieldValue != fieldValue:
                    printf( '    .morphs for %d[%s]' % ( nid, fieldName ) )
                    newLoc = AnkiDeck( nid, fieldName, fieldValue, guid, mats )
                    ms = getMorphemes( fieldValue )
                    locDb.pop( loc )
                    locDb[ newLoc ] = ms
    printf( 'Processed all %d notes in %f sec' % ( N_notes, time.time() - t_0 ) )
    mw.progress.update( value=i, label='Creating all.db object' )
    # rebuild allDb from the updated location index
    allDb.clear()
    allDb.addFromLocDb( locDb )
    if cfg1('saveDbs'):
        mw.progress.update( value=i, label='Saving all.db to disk' )
        allDb.save( cfg1('path_all') )
        printf( 'Processed all %d notes + saved all.db in %f sec' % ( N_notes, time.time() - t_0 ) )
    mw.progress.finish()
    return allDb

예제 #5

0

파일 보기

def main():
    """Run a full recalc: load all.db, refresh it, merge ext.db, update notes and stats."""
    # Step 1: optionally reuse the all.db that already exists
    mw.progress.start(label='Loading existing all.db', immediate=True)
    loadStart = time.time()
    if cfg1('loadAllDb'):
        previousDb = util.allDb()
    else:
        previousDb = None
    printf('Loaded all.db in %f sec' % (time.time() - loadStart))
    mw.progress.finish()

    # Step 2: bring all.db up to date with the collection
    refreshedDb = mkAllDb(previousDb)

    # Step 3: fold the external db into it
    mw.progress.start(label='Merging ext.db', immediate=True)
    externalDb = MorphDb(cfg1('path_ext'), ignoreErrors=True)
    refreshedDb.merge(externalDb)
    mw.progress.finish()

    # Step 4: rewrite notes, then refresh stats and the toolbar from the known db
    knownDb = updateNotes(refreshedDb)
    stats.updateStats(knownDb)
    mw.toolbar.draw()

    # Step 5: publish the result as the module-level cached all.db
    util._allDb = refreshedDb

예제 #6

0

파일 보기

파일: extractMorphemes.py 프로젝트: guluarte/MorphMan

def pre(b):
    """Ask where to save the exported morpheme db and return the initial state.

    Returns None when the dialog yields an empty path; otherwise a dict with
    'dbpath' (the chosen path as str) and 'morphDb' (a new empty MorphDb).
    """
    # dbsPath is not defined until late, so it cannot be imported at module top
    from util import dbsPath
    suggested = dbsPath + os.sep + 'exportedMorphs.db'
    chosen = QFileDialog.getSaveFileName(caption='Save morpheme db to?',
                                         directory=suggested)
    if chosen:
        return {'dbpath': str(chosen), 'morphDb': MorphDb()}
    return None

예제 #7

0

파일 보기

파일: massTagger.py 프로젝트: NinKenDo64/Jieba-Morph

def pre(b):  # :: Browser -> State
    """Prompt for tags and a morpheme db, returning the state for the tagging pass.

    Returns None when the tag prompt is not accepted or empty, or when the
    file dialog yields an empty path. Otherwise returns a dict with 'b' (the
    browser passed in), 'db' (the MorphDb opened from the chosen path) and
    'tags' (the entered text as unicode).
    """
    enteredTags, accepted = QInputDialog.getText(
        b, 'Enter tags', 'Tags', QLineEdit.Normal, 'hasMorph')
    if not (accepted and enteredTags):
        return None

    dbFile = QFileDialog.getOpenFileName(caption='Open db',
                                         directory=util.dbsPath)
    if not dbFile:
        return None

    morphDb = MorphDb(dbFile)
    return {'b': b, 'db': morphDb, 'tags': unicode(enteredTags)}

예제 #8

0

파일 보기

파일: manager.py 프로젝트: Fivapr/JapaneseStudy

 def onExtractTxtFile( self ):
     '''Extract morphemes from a user-chosen text file and save them as a morpheme db.

     Does nothing when either file dialog yields an empty path, or when
     MorphDb.mkFromFile returns a falsy result.
     '''
     source = QFileDialog.getOpenFileName( caption='Text file to extract from?', directory=dbsPath )
     if not source:
         return
     defaultTarget = dbsPath + os.sep + 'textFile.db'
     target = QFileDialog.getSaveFileName( caption='Save morpheme db to?', directory=defaultTarget )
     if not target:
         return
     importMaturity = cfg1('text file import maturity')
     extracted = MorphDb.mkFromFile( str(source), importMaturity )
     if not extracted:
         return
     extracted.save( str(target) )
     infoMsg( 'Extracted successfully' )

예제 #9

0

파일 보기

파일: manager.py 프로젝트: guluarte/MorphMan

 def onExtractTxtFile( self ):
     '''Extract morphemes from a chosen text file with the selected morphemizer and save them.

     The morphemizer is picked from getAllMorphemizers() by the current index
     of self.morphemizerComboBox. Does nothing when either file dialog yields
     an empty path, or when MorphDb.mkFromFile returns a falsy result.
     '''
     source = QFileDialog.getOpenFileName( caption='Text file to extract from?', directory=dbsPath )
     if not source:
         return
     defaultTarget = dbsPath + os.sep + 'textFile.db'
     target = QFileDialog.getSaveFileName( caption='Save morpheme db to?', directory=defaultTarget )
     if not target:
         return
     importMaturity = cfg1('text file import maturity')
     sourceText = str(source)
     chosenMorphemizer = getAllMorphemizers()[self.morphemizerComboBox.currentIndex()]
     extracted = MorphDb.mkFromFile( sourceText, chosenMorphemizer, importMaturity )
     if not extracted:
         return
     extracted.save( str(target) )
     infoMsg( 'Extracted successfully' )

예제 #10

0

파일 보기

def updateStats(knownDb=None):
    """Recompute known-morpheme statistics, persist them via saveStats, and return them.

    knownDb -- MorphDb of known morphemes; when None it is loaded from
               cfg1('path_known') with ignoreErrors=True.

    The returned dict has 'totalKnown' (number of entries in knownDb.db) and
    'goals', mapping each Goal.*.db name to its 'total' / 'known' unique
    counts and 'freqTotal' / 'freqKnown' frequency-weighted counts.
    """
    mw.progress.start(label='Updating stats', immediate=True)

    from morphemes import MorphDb

    # Fall back to known.db on disk when the caller did not pass one
    if knownDb is None:
        knownDb = MorphDb(cfg1('path_known'), ignoreErrors=True)

    goals = {}
    d = {'totalKnown': len(knownDb.db), 'goals': goals}

    # Compare every Goal.*.db in the dbs directory against the known db
    pattern = os.path.join(cfg1('path_dbs'), 'Goal.*.db')
    for goalPath in glob.glob(pattern):
        # strip the leading 'Goal.' and the trailing '.db' from the file name
        goalName = os.path.basename(goalPath)[5:-3]
        goalDb = MorphDb(goalPath)

        # NOTE: a morpheme may occur multiple times within the same sentence,
        # but this frequency is wrt note fields
        # NOTE(review): frequency() is looked up on goalDb.db, not on goalDb itself - confirm
        uniqueReq = uniqueKnown = freqReq = freqKnown = 0
        for morph in goalDb.db.iterkeys():
            weight = goalDb.db.frequency(morph)
            uniqueReq += 1
            freqReq += weight
            if morph not in knownDb.db:
                continue
            uniqueKnown += 1
            freqKnown += weight

        goals[goalName] = {
            'total': uniqueReq,
            'known': uniqueKnown,
            'freqTotal': freqReq,
            'freqKnown': freqKnown
        }

    saveStats(d)
    mw.progress.finish()
    return d

예제 #11

0

파일 보기

파일: readability.py 프로젝트: wsavoie/Manabi

        def measure_readability(self, file_name, is_ass, is_srt):
            """Measure one input against the known-morpheme db and write a result row.

            A tab-separated header row is written on every call, followed by one
            row of counts and percentages for this input.

            file_name -- path of a UTF-8 text file, or the literal 'clipboard'
                         to read the text from pyperclip.paste()
            is_ass    -- treat the text as ASS subtitles (Format:/Dialogue: lines)
            is_srt    -- treat the text as SRT subtitles (used when is_ass is falsy)

            Any exception is reported via self.writeOutput and then re-raised.

            Relies on names from the enclosing scope that are not visible in
            this block: log_fp, known_db, unknown_db, all_morphs,
            save_study_plan and sources.
            """
            self.writeOutput(
                '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %
                ("Input", "Total Morphs", "Known Morphs", "% Known Morphs",
                 "Total Instances", "Known Instances", "% Readability",
                 "% Proper Nouns", "% Known Lines", "% i+1 Lines"))

            #filename will be clipboard if reading from clipboard
            log_fp.write('measure_readability %s\n' % file_name)

            # Per-input tallies, updated by the nested helpers through `nonlocal`
            proper_noun_count = 0
            i_count = 0
            line_count = 0
            line_morphs = []
            known_line_count = 0
            iplus1_line_count = 0
            known_count = 0
            seen_morphs = {}
            known_morphs = {}
            source_unknown_db = MorphDb()

            def proc_lines(text, is_ass, is_srt):
                """Split `text` into lines, strip subtitle markup, and feed the text to parse_text."""
                nonlocal i_count, known_count, seen_morphs, known_morphs, all_morphs
                nonlocal proper_noun_count, line_count, known_line_count, iplus1_line_count, line_morphs

                # ASS state: position of the 'Text' column and number of columns
                text_index = -1
                num_fields = 1
                # SRT state: lines seen since the last blank line
                srt_count = 0

                def parse_text(text):
                    """Morphemize `text` and update the morph, instance and line tallies.

                    Returns without counting a line when no morphemes are parsed.
                    """
                    nonlocal i_count, known_count, seen_morphs, known_morphs, all_morphs
                    nonlocal proper_noun_count, line_count, known_line_count, iplus1_line_count, line_morphs

                    log_fp.write('=== parse_text ===\n' + text + '\n')
                    # print('strip',stripHTML(text))
                    parsed_morphs = getMorphemes(self.morphemizer,
                                                 stripHTML(text))
                    # parsed_morphs = getMorphemes(morphemizer, text)
                    if len(parsed_morphs) == 0:
                        return

                    unknown_count = 0
                    line_missing_morphs = set()
                    for m in parsed_morphs:
                        # Count morph for word report
                        all_morphs[m] = all_morphs.get(m, 0) + 1
                        seen_morphs[m] = seen_morphs.get(m, 0) + 1

                        if m.isProperNoun():
                            proper_noun_count += 1
                            is_proper_noun = True
                        else:
                            is_proper_noun = False

                        i_count += 1
                        if known_db.matches(
                                m
                        ) or is_proper_noun:  # Proper nouns are easy to learn, so assume they're known.
                            known_morphs[m] = known_morphs.get(m, 0) + 1
                            known_count += 1
                        else:
                            # record the unknown in both the shared and the per-source db
                            unknown_db.addMorph(m, 1)
                            source_unknown_db.addMorph(m, 1)
                            line_missing_morphs.add(m)
                            unknown_count += 1
                    line_count += 1
                    # unknown_count counts unknown instances in this chunk, not unique morphs
                    if unknown_count == 0:
                        known_line_count += 1
                    elif unknown_count == 1:
                        iplus1_line_count += 1
                    line_morphs.append(line_missing_morphs)

                filtered_text = ''
                for t in text.splitlines():
                    should_flush = True
                    if is_ass:
                        if 'Format:' in t:
                            # remember where the 'Text' column sits in this Format line
                            formats = [x.strip() for x in t[8:].split(',')]
                            if 'Text' in formats:
                                text_index = formats.index('Text')
                                num_fields = len(formats)
                            else:
                                text_index = -1
                            continue
                        elif ('Dialogue:' not in t) or (text_index < 0):
                            continue
                        # split into at most num_fields columns so commas inside the last column survive
                        t = t[9:].split(',', num_fields - 1)
                        t = t[text_index]
                    elif is_srt:
                        srt_count += 1
                        # the first two lines after a blank line are skipped
                        if srt_count <= 2:
                            continue
                        elif t == '':
                            srt_count = 0
                        else:
                            # keep accumulating until the blank line that ends this entry
                            should_flush = False

                    if t != '':
                        filtered_text += t + '\n'

                    # Todo: This will flush every line so we can compute per-line readability, which is slower than batching lines.
                    #       Figure out how to get per-line analysis with batched lines.
                    if should_flush:
                        #if len(filtered_text) >= 2048:
                        parse_text(filtered_text)
                        filtered_text = ''

                # flush whatever is still buffered after the last line
                parse_text(filtered_text)

            try:
                if file_name == 'clipboard':
                    input = pyperclip.paste()
                else:
                    with open(file_name.strip(), 'rt', encoding='utf-8') as f:
                        input = f.read()

                # drop byte-order marks anywhere in the text
                input = input.replace(u'\ufeff', '')

                #input = [l.replace(u'\ufeff', '') for l in f.read()]
                proc_lines(input, is_ass, is_srt)
                source = Source(file_name, seen_morphs, line_morphs,
                                source_unknown_db)
                known_percent = 0.0 if len(
                    seen_morphs.keys()
                ) == 0 else 100.0 * len(known_morphs) / len(seen_morphs.keys())
                readability = 0.0 if i_count == 0 else 100.0 * known_count / i_count
                # NOTE(review): the guard tests line_count but the divisor is i_count - confirm intended
                proper_noun_percent = 0.0 if line_count == 0 else 100.0 * proper_noun_count / i_count
                line_percent = 0.0 if line_count == 0 else 100.0 * known_line_count / line_count
                iplus1_percent = 0.0 if line_count == 0 else 100.0 * iplus1_line_count / line_count

                self.writeOutput(
                    '%s\t%d\t%d\t%0.2f\t%d\t%d\t%0.2f\t%0.2f\t%0.2f\t%0.2f\n' %
                    (source.name, len(seen_morphs), len(known_morphs),
                     known_percent, i_count, known_count, readability,
                     proper_noun_percent, line_percent, iplus1_percent))
                # row = self.ui.readabilityTable.rowCount()
                # self.ui.readabilityTable.insertRow(row)
                # self.ui.readabilityTable.setItem(row, 0, QTableWidgetItem(source.name))
                # self.ui.readabilityTable.setItem(row, 1, TableInteger(len(seen_morphs)))
                # self.ui.readabilityTable.setItem(row, 2, TableInteger(len(known_morphs)))
                # self.ui.readabilityTable.setItem(row, 3, TablePercent(known_percent))
                # self.ui.readabilityTable.setItem(row, 4, TableInteger(i_count))
                # self.ui.readabilityTable.setItem(row, 5, TableInteger(known_count))
                # self.ui.readabilityTable.setItem(row, 6, TablePercent(readability))
                # self.ui.readabilityTable.setItem(row, 7, TablePercent(proper_noun_percent))
                # self.ui.readabilityTable.setItem(row, 8, TablePercent(line_percent))
                # self.ui.readabilityTable.setItem(row, 9, TablePercent(iplus1_percent))

                if save_study_plan:
                    sources.append(source)
            except:
                # report the failing input, then let the original exception propagate
                self.writeOutput("Failed to process '%s'\n" % file_name)
                raise

예제 #12

0

파일 보기

def mkAllDb(allDb=None):
    '''Build or refresh the all.db MorphDb from every enabled note in the collection.

    allDb -- existing MorphDb to update; a new empty MorphDb is created when this
             is falsy.

    Returns the updated MorphDb. It is also saved to cfg1('path_all') when
    cfg1('saveDbs') is set. A TypeError raised while reading a configured field
    is reported through errorMsg and then re-raised.
    '''
    t_0, db, TAG = time.time(), mw.col.db, mw.col.tags
    N_notes = db.scalar('select count() from notes')
    mw.progress.start(label='Prep work for all.db creation',
                      max=N_notes,
                      immediate=True)

    if not allDb: allDb = MorphDb()
    fidDb = allDb.fidDb()
    locDb = allDb.locDb(recalc=False)  # fidDb() already forces locDb recalc

    mw.progress.update(label='Generating all.db data')
    i = 0  # keep `i` bound for the progress updates below when there are no notes
    for i, (nid, mid, flds, guid, tags) in enumerate(
            db.execute('select id, mid, flds, guid, tags from notes')):
        if i % 500 == 0: mw.progress.update(value=i)
        C = partial(cfg, mid, None)
        if not C('enabled'): continue

        # one maturity per card of the note; ivl 0 with card type 1 is recorded as 0.5
        mats = [(0.5 if ivl == 0 and ctype == 1 else ivl)
                for ivl, ctype in db.execute(
                    'select ivl, type from cards where nid = :nid', nid=nid)]
        if C('ignore maturity'):
            mats = [0 for mat in mats]
        ts, alreadyKnownTag = TAG.split(tags), C('tag_alreadyKnown')
        if alreadyKnownTag in ts:
            # tagged notes get an extra maturity just above the mature threshold
            mats += [C('threshold_mature') + 1]

        for fieldName in C('morph_fields'):
            try:  # if doesn't have field, continue
                #fieldValue = normalizeFieldValue( getField( fieldName, flds, mid ) )
                fieldValue = getMecabField(fieldName, flds, mid)
            except KeyError:
                continue
            except TypeError:
                mname = mw.col.models.get(mid)['name']
                errorMsg(
                    u'Failed to get field "{field}" from a note of model "{model}". Please fix your config.py file to match your collection appropriately and ignore the following error.'
                    .format(model=mname, field=fieldName))
                raise

            loc = fidDb.get((nid, guid, fieldName), None)
            if not loc:
                # field not indexed yet -> new loc, only stored when it has morphemes
                loc = AnkiDeck(nid, fieldName, fieldValue, guid, mats)
                ms = getMorphemes(fieldValue)
                if ms:  #TODO: this needed? should we change below too then?
                    #printf( '    .loc for %d[%s]' % ( nid, fieldName ) )
                    locDb[loc] = ms
            else:
                # mats changed -> new loc (new mats), move morphs
                if loc.fieldValue == fieldValue and loc.maturities != mats:
                    printf('    .mats for %d[%s]' % (nid, fieldName))
                    newLoc = AnkiDeck(nid, fieldName, fieldValue, guid, mats)
                    locDb[newLoc] = locDb.pop(loc)
                # field changed -> new loc, new morphs
                elif loc.fieldValue != fieldValue:
                    printf('    .morphs for %d[%s]' % (nid, fieldName))
                    newLoc = AnkiDeck(nid, fieldName, fieldValue, guid, mats)
                    ms = getMorphemes(fieldValue)
                    locDb.pop(loc)
                    locDb[newLoc] = ms
    printf('Processed all %d notes in %f sec' % (N_notes, time.time() - t_0))
    mw.progress.update(value=i, label='Creating all.db object')
    # rebuild allDb from the updated location index
    allDb.clear()
    allDb.addFromLocDb(locDb)
    if cfg1('saveDbs'):
        mw.progress.update(value=i, label='Saving all.db to disk')
        allDb.save(cfg1('path_all'))
        printf('Processed all %d notes + saved all.db in %f sec' %
               (N_notes, time.time() - t_0))
    mw.progress.finish()
    return allDb

예제 #13

0

파일 보기

파일: main.py 프로젝트: ChangSpivey/MorphMan

def mkAllDb( allDb=None ):
    '''Build or refresh the all.db MorphDb from every note that matches a filter.

    allDb -- existing MorphDb to update; a new empty MorphDb is created when this
             is falsy.

    Returns the updated MorphDb (also saved to cfg1('path_all') when
    cfg1('saveDbs') is set), or None after showing an error message when no
    note matched any filter. A TypeError raised while reading a configured
    field is reported through errorMsg and then re-raised.
    '''
    import config; reload(config)
    t_0, db, TAG = time.time(), mw.col.db, mw.col.tags
    N_notes = db.scalar( 'select count() from notes' )
    N_enabled_notes = 0 # for providing an error message if there is no note that is used for processing
    mw.progress.start( label='Prep work for all.db creation', max=N_notes, immediate=True )

    if not allDb: allDb = MorphDb()
    fidDb   = allDb.fidDb()
    locDb   = allDb.locDb( recalc=False )   # fidDb() already forces locDb recalc

    mw.progress.update( label='Generating all.db data' )
    for i,( nid, mid, flds, guid, tags ) in enumerate( db.execute( 'select id, mid, flds, guid, tags from notes' ) ):
        if i % 500 == 0:    mw.progress.update( value=i )
        C = partial( cfg, mid, None )

        # notes without a matching filter are skipped entirely
        note = mw.col.getNote(nid)
        notecfg = getFilter(note)
        if notecfg is None: continue
        morphemizer = getMorphemizerByName(notecfg['Morphemizer'])

        N_enabled_notes += 1

        # one maturity per card of the note; ivl 0 with card type 1 is recorded as 0.5
        mats = [ ( 0.5 if ivl == 0 and ctype == 1 else ivl ) for ivl, ctype in db.execute( 'select ivl, type from cards where nid = :nid', nid=nid ) ]
        if C('ignore maturity'):
            mats = [ 0 ] * len( mats )
        ts, alreadyKnownTag = TAG.split( tags ), jcfg('Tag_AlreadyKnown')
        if alreadyKnownTag in ts:
            # tagged notes get an extra maturity just above the mature threshold
            mats += [ C('threshold_mature')+1 ]

        for fieldName in notecfg['Fields']:
            try: # if doesn't have field, continue
                #fieldValue = normalizeFieldValue( getField( fieldName, flds, mid ) )
                fieldValue = extractFieldData( fieldName, flds, mid )
            except KeyError: continue
            except TypeError:
                mname = mw.col.models.get( mid )[ 'name' ]
                errorMsg( u'Failed to get field "{field}" from a note of model "{model}". Please fix your config.py file to match your collection appropriately and ignore the following error.'.format( model=mname, field=fieldName ) )
                raise

            loc = fidDb.get( ( nid, guid, fieldName ), None )
            if not loc:
                # field not indexed yet -> new loc, only stored when it has morphemes
                loc = AnkiDeck( nid, fieldName, fieldValue, guid, mats )
                ms = getMorphemes(morphemizer, fieldValue, ts)
                if ms: #TODO: this needed? should we change below too then?
                    #printf( '    .loc for %d[%s]' % ( nid, fieldName ) )
                    locDb[ loc ] = ms
            else:
                # mats changed -> new loc (new mats), move morphs
                if loc.fieldValue == fieldValue and loc.maturities != mats:
                    #printf( '    .mats for %d[%s]' % ( nid, fieldName ) )
                    newLoc = AnkiDeck( nid, fieldName, fieldValue, guid, mats )
                    locDb[ newLoc ] = locDb.pop( loc )
                # field changed -> new loc, new morphs
                elif loc.fieldValue != fieldValue:
                    #printf( '    .morphs for %d[%s]' % ( nid, fieldName ) )
                    newLoc = AnkiDeck( nid, fieldName, fieldValue, guid, mats )
                    ms = getMorphemes(morphemizer, fieldValue, ts)
                    locDb.pop( loc )
                    locDb[ newLoc ] = ms

    # nothing matched any filter: close the progress dialog, tell the user, and bail out
    if N_enabled_notes == 0:
        mw.progress.finish()
        errorMsg(u'There is no card that can be analyzed or be moved. Add cards or (re-)check your configuration under "Tools -> MorphMan Preferences" or in "Anki/addons/morph/config.py" for mistakes.')
        return None

    printf( 'Processed all %d notes in %f sec' % ( N_notes, time.time() - t_0 ) )
    mw.progress.update( value=i, label='Creating all.db object' )
    # rebuild allDb from the updated location index
    allDb.clear()
    allDb.addFromLocDb( locDb )
    if cfg1('saveDbs'):
        mw.progress.update( value=i, label='Saving all.db to disk' )
        allDb.save( cfg1('path_all') )
        printf( 'Processed all %d notes + saved all.db in %f sec' % ( N_notes, time.time() - t_0 ) )
    mw.progress.finish()
    return allDb

예제 #14

0

파일 보기

def updateNotes(allDb):
    """Recompute morph fields, tags and new-card order for every enabled note.

    allDb -- MorphDb covering the whole collection.

    Builds the seen/known/mature dbs by filtering allDb on the configured
    thresholds (saving them when cfg1('saveDbs') is set), computes the Morph
    Man Index (MMI) of each enabled note, writes changed fields and tags back
    to the notes table, and sets the due value of new cards to their note's
    MMI when 'set due based on mmi' is configured.

    Returns the known db.
    """
    t_0, now, db, TAG = time.time(), intTime(), mw.col.db, mw.col.tags
    ds, nid2mmi = [], {}
    N_notes = db.scalar('select count() from notes')
    mw.progress.start(label='Updating data', max=N_notes, immediate=True)
    fidDb = allDb.fidDb()
    locDb = allDb.locDb(recalc=False)  # fidDb() already forces locDb recalc

    # handle secondary databases
    mw.progress.update(label='Creating seen/known/mature from all.db')
    seenDb = filterDbByMat(allDb, cfg1('threshold_seen'))
    knownDb = filterDbByMat(allDb, cfg1('threshold_known'))
    matureDb = filterDbByMat(allDb, cfg1('threshold_mature'))
    mw.progress.update(label='Loading priority.db')
    priorityDb = MorphDb(cfg1('path_priority'), ignoreErrors=True).db

    if cfg1('saveDbs'):
        mw.progress.update(label='Saving seen/known/mature dbs')
        seenDb.save(cfg1('path_seen'))
        knownDb.save(cfg1('path_known'))
        matureDb.save(cfg1('path_mature'))

    mw.progress.update(label='Calculating frequency information')
    pops = [len(locs) for locs in allDb.db.values()]
    pops = [n for n in pops if n > 1]

    mw.progress.update(label='Updating notes')
    # Both names are read after the loop; bind them first so that an empty
    # collection, or one where every note is skipped, cannot raise NameError.
    i = 0
    tagNames = ()
    for i, (nid, mid, flds, guid, tags) in enumerate(
            db.execute('select id, mid, flds, guid, tags from notes')):
        if i % 500 == 0: mw.progress.update(value=i)
        C = partial(cfg, mid, None)
        if not C('enabled'): continue
        # Get all morphemes for note
        ms = set()
        for fieldName in C('morph_fields'):
            try:
                loc = fidDb[(nid, guid, fieldName)]
                ms.update(locDb[loc])
            except KeyError:
                continue
        ms = [m for m in ms if m.pos not in C('morph_blacklist')]

        # Determine un-seen/known/mature and i+N
        unseens, unknowns, unmatures, newKnowns = set(), set(), set(), set()
        for m in ms:
            if m not in seenDb.db: unseens.add(m)
            if m not in knownDb.db: unknowns.add(m)
            if m not in matureDb.db: unmatures.add(m)
            if m not in matureDb.db and m in knownDb.db:
                newKnowns.add(m)

        # Determine MMI - Morph Man Index
        N, N_s, N_k, N_m = len(ms), len(unseens), len(unknowns), len(unmatures)

        # Bail early for lite update
        if N_k > 2 and C('only update k+2 and below'): continue

        # average frequency of unknowns (ie. how common the word is within your collection)
        F_k = 0
        for focusMorph in unknowns:  # focusMorph used outside loop
            F_k += len(allDb.db[focusMorph])
        F_k_avg = F_k / N_k if N_k > 0 else F_k
        usefulness = F_k_avg

        # add bonus for morphs in priority.db
        isPriority = False
        for focusMorph in unknowns:
            if focusMorph in priorityDb:
                isPriority = True
                usefulness += C('priority.db weight')

        # add bonus for studying recent learned knowns (reinforce)
        for m in newKnowns:
            locs = allDb.db[m]
            if locs:
                ivl = min(1, max(loc.maturity for loc in locs))
                usefulness += C(
                    'reinforce new vocab weight'
                ) / ivl  #TODO: maybe average this so it doesnt favor long sentences

        if any(m.pos == u'動詞'
               for m in unknowns):  #FIXME: this isn't working???
            usefulness += C('verb bonus')

        # invert and clamp to 0..999 so that more useful notes get a smaller value
        usefulness = 999 - min(999, usefulness)

        # difference from optimal length (too little context vs long sentence)
        lenDiff = max(0, min(9, abs(C('optimal sentence length') - N) - 2))
        tooLong = N > C('optimal sentence length')

        # calculate mmi
        mmi = 10000 * N_k + 1000 * lenDiff + usefulness
        if C('set due based on mmi'):
            nid2mmi[nid] = mmi

        # Fill in various fields/tags on the note based on cfg
        ts, fs = TAG.split(tags), splitFields(flds)
        # determine card type
        compTag, vocabTag, notReadyTag, alreadyKnownTag, priorityTag, badLengthTag, tooLongTag = tagNames = C(
            'tag_comprehension'), C('tag_vocab'), C('tag_notReady'), C(
                'tag_alreadyKnown'), C('tag_priority'), C('tag_badLength'), C(
                    'tag_tooLong')
        if N_m == 0:  # sentence comprehension card, m+0
            ts = [compTag
                  ] + [t for t in ts if t not in [vocabTag, notReadyTag]]
            setField(mid, fs, C('focusMorph'), u'')
        elif N_k == 1:  # new vocab card, k+1
            ts = [vocabTag
                  ] + [t for t in ts if t not in [compTag, notReadyTag]]
            # with exactly one unknown, focusMorph (left over from the loops
            # above) is that single unknown morpheme
            setField(mid, fs, C('focusMorph'), u'%s' % focusMorph.base)
        elif N_k > 1:  # M+1+ and K+2+
            ts = [notReadyTag
                  ] + [t for t in ts if t not in [compTag, vocabTag]]

        # set type agnostic fields
        setField(mid, fs, C('k+N'), u'%d' % N_k)
        setField(mid, fs, C('m+N'), u'%d' % N_m)
        setField(mid, fs, C('morphManIndex'), u'%d' % mmi)
        setField(mid, fs, C('unknowns'), u', '.join(u.base for u in unknowns))
        setField(mid, fs, C('unmatures'),
                 u', '.join(u.base for u in unmatures))
        setField(mid, fs, C('unknownFreq'), u'%d' % F_k_avg)

        # other tags: drop the old marker, then re-add it if it still applies
        if priorityTag in ts: ts.remove(priorityTag)
        if isPriority: ts.append(priorityTag)

        if badLengthTag in ts: ts.remove(badLengthTag)
        if lenDiff: ts.append(badLengthTag)

        if tooLongTag in ts: ts.remove(tooLongTag)
        if tooLong: ts.append(tooLongTag)

        # update sql db
        tags_ = TAG.join(TAG.canonify(ts))
        flds_ = joinFields(fs)
        if flds != flds_ or tags != tags_:  # only update notes that have changed
            csum = fieldChecksum(fs[0])
            sfld = stripHTML(fs[getSortFieldIndex(mid)])
            ds.append({
                'now': now,
                'tags': tags_,
                'flds': flds_,
                'sfld': sfld,
                'csum': csum,
                'usn': mw.col.usn(),
                'nid': nid
            })

    mw.progress.update(value=i, label='Updating anki database...')
    mw.col.db.executemany(
        'update notes set tags=:tags, flds=:flds, sfld=:sfld, csum=:csum, mod=:now, usn=:usn where id=:nid',
        ds)
    # tagNames stays empty when no note reached the tagging step
    if tagNames:
        TAG.register(tagNames)

    # Now reorder new cards based on MMI
    mw.progress.update(value=i, label='Updating new card ordering...')
    ds = []
    for (cid, nid,
         due) in db.execute('select id, nid, due from cards where type = 0'):
        if nid in nid2mmi:  # owise it was disabled
            due_ = nid2mmi[nid]
            if due != due_:  # only update cards that have changed
                ds.append({
                    'now': now,
                    'due': due_,
                    'usn': mw.col.usn(),
                    'cid': cid
                })
    mw.col.db.executemany(
        'update cards set due=:due, mod=:now, usn=:usn where id=:cid', ds)
    mw.reset()

    printf('Updated notes in %f sec' % (time.time() - t_0))
    mw.progress.finish()
    return knownDb

예제 #15

0

파일 보기

파일: dumb_db.py 프로젝트: wlabelle/MorphMan

 def __init__(self, path=None, parent=None):
     '''Store a MorphDb constructed from `path` on self.db; `parent` is not used here.'''
     morphDb = MorphDb(path)
     self.db = morphDb

예제 #16

0

파일 보기

파일: manager.py 프로젝트: thaumkid/MorphMan

 def loadA(self):
     '''Load db "A" from the path typed into aPathLEdit.

     The path and the loaded MorphDb are kept on self.aPath / self.aDb; the
     loaded db also becomes self.db when self.db is currently falsy.
     '''
     self.aPath = self.aPathLEdit.text()
     self.aDb = MorphDb(path=self.aPath)
     if self.db:
         return
     self.db = self.aDb

예제 #17

0

파일 보기

파일: util.py 프로젝트: whiteskulleton/JapaneseStudy

def allDb():
    """Return the shared all-morphemes database, creating it on first use.

    The instance is cached in the module-level ``_allDb``.  On a cache miss
    it is built from the ``path_all`` config value with ``ignoreErrors=True``.
    """
    global _allDb
    if _allDb is not None:
        return _allDb
    # Imported lazily: only needed the first time the cache is filled.
    from morphemes import MorphDb
    _allDb = MorphDb(cfg1('path_all'), ignoreErrors=True)
    return _allDb

예제 #18

0

파일 보기

파일: main.py 프로젝트: thaumkid/MorphMan

def mkAllDb(allDb=None):
    """Build or refresh the all.db morpheme database from the collection.

    Every note accepted by ``getFilter`` is scanned; for each configured
    field the location -> morphemes map is created or updated, depending on
    whether the field text or the card maturities changed since `allDb` was
    built.

    allDb -- existing database to update; a fresh ``MorphDb`` is used when
             this is falsy.

    Returns the updated database, or ``None`` (after showing an error
    message) when no note was enabled for processing.
    """
    import config
    reload(config)
    t_0 = time.time()
    db = mw.col.db
    TAG = mw.col.tags
    N_notes = db.scalar('select count() from notes')
    # Counted so an error can be reported if no note is used for processing.
    N_enabled_notes = 0
    mw.progress.start(label='Prep work for all.db creation',
                      max=N_notes,
                      immediate=True)

    if not allDb:
        allDb = MorphDb()
    fidDb = allDb.fidDb()
    # fidDb() already forces a locDb recalc, so the cached one can be reused.
    locDb = allDb.locDb(recalc=False)

    mw.progress.update(label='Generating all.db data')
    note_rows = db.execute('select id, mid, flds, guid, tags from notes')
    for i, (nid, mid, flds, guid, tags) in enumerate(note_rows):
        if i % 500 == 0:
            mw.progress.update(value=i)
        C = partial(cfg, mid, None)

        notecfg = getFilter(mw.col.getNote(nid))
        if notecfg is None:
            continue
        morphemizer = getMorphemizerByName(notecfg['Morphemizer'])

        N_enabled_notes += 1

        # One maturity per card: the interval, or 0.5 for type-1 cards whose
        # interval is still 0.
        mats = []
        for ivl, ctype in db.execute(
                'select ivl, type from cards where nid = :nid', nid=nid):
            mats.append(0.5 if ivl == 0 and ctype == 1 else ivl)
        if C('ignore maturity'):
            mats = [0] * len(mats)
        ts = TAG.split(tags)
        alreadyKnownTag = jcfg('Tag_AlreadyKnown')
        if alreadyKnownTag in ts:
            mats.append(C('threshold_mature') + 1)

        for fieldName in notecfg['Fields']:
            try:
                fieldValue = extractFieldData(fieldName, flds, mid)
            except KeyError:
                # The note does not have this field; move on to the next one.
                continue
            except TypeError:
                mname = mw.col.models.get(mid)['name']
                errorMsg(
                    u'Failed to get field "{field}" from a note of model "{model}". Please fix your config.py file to match your collection appropriately and ignore the following error.'
                    .format(model=mname, field=fieldName))
                raise

            loc = fidDb.get((nid, guid, fieldName), None)
            if not loc:
                # Unknown location: register it when it yields any morphemes.
                loc = AnkiDeck(nid, fieldName, fieldValue, guid, mats)
                ms = getMorphemes(morphemizer, fieldValue, ts)
                if ms:  # TODO: this needed? should the branches below match?
                    locDb[loc] = ms
                continue

            if loc.fieldValue != fieldValue:
                # Field changed -> new loc with freshly extracted morphemes.
                newLoc = AnkiDeck(nid, fieldName, fieldValue, guid, mats)
                ms = getMorphemes(morphemizer, fieldValue, ts)
                locDb.pop(loc)
                locDb[newLoc] = ms
            elif loc.maturities != mats:
                # Only maturities changed -> new loc, morphemes moved over.
                newLoc = AnkiDeck(nid, fieldName, fieldValue, guid, mats)
                locDb[newLoc] = locDb.pop(loc)

    if N_enabled_notes == 0:
        mw.progress.finish()
        errorMsg(
            u'There is no card that can be analyzed or be moved. Add cards or (re-)check your configuration under "Tools -> MorhpMan Preferences" or in "Anki/addons/morph/config.py" for mistakes.'
        )
        return None

    printf('Processed all %d notes in %f sec' % (N_notes, time.time() - t_0))
    mw.progress.update(value=i, label='Creating all.db object')
    allDb.clear()
    allDb.addFromLocDb(locDb)
    if cfg1('saveDbs'):
        mw.progress.update(value=i, label='Saving all.db to disk')
        allDb.save(cfg1('path_all'))
        elapsed = time.time() - t_0
        printf('Processed all %d notes + saved all.db in %f sec' %
               (N_notes, elapsed))
    mw.progress.finish()
    return allDb

예제 #19

0

파일 보기

파일: manager.py 프로젝트: thaumkid/MorphMan

 def loadB(self):
     """Load database B from the path currently held by ``self.bPathLEdit``.

     The path is stored in ``self.bPath`` and the loaded database in
     ``self.bDb``.
     """
     path_text = self.bPathLEdit.text()
     self.bPath = path_text
     self.bDb = MorphDb(path=self.bPath)

예제 #20

0

파일 보기

파일: readability.py 프로젝트: wsavoie/Manabi

    def onAnalyze(self):
        """Measure readability of the configured input and write the reports.

        Settings come from the ``DEFAULT`` section of the module-level config
        ``p``.  Input is the configured path (one file, or a directory tree of
        ``.srt``/``.ass``/``.txt`` files) when ``inputtype`` is set and the
        window is not minimized; otherwise the clipboard text is analyzed.

        For every source a tab-separated readability row is passed to
        ``self.writeOutput``.  Depending on the ``save_*`` options a word
        report, a study plan and a frequency list are written to the output
        directory, next to ``readability_log.txt``.

        Side effects: sets ``self.morphemizer`` and ``self.known_db``.  While
        the study plan is built, morphs picked for study are added to the
        in-memory known database.

        Returns ``None``; returns early when an input path was given but no
        acceptable file was found under it.
        """
        opts = p['DEFAULT']
        self.morphemizer = getMorphemizerByName(opts['currmophemizer'])
        self.writeOutput('Using morphemizer: %s \n' %
                         self.morphemizer.getDescription())
        debug_output = False

        # Only use the path when inputtype is checked and the window is not
        # minimized.  `not` is required: `~` on a bool gives -1/-2, which are
        # both truthy, so the minimized state would never be honoured.
        input_path = None  # None means "analyze the clipboard"
        if opts.getboolean('inputtype') and not opts.getboolean('minimized'):
            # TODO if certain keypress, force analyze through clipboard
            input_path = opts['inputpath']
        minimum_master_frequency = opts.getint('min_master_freq')
        readability_target = opts.getfloat('read_target')
        master_freq_path = opts['frequencylist']
        known_words_path = opts['knownmorphs']

        output_path = opts['outputpath']

        save_frequency_list = opts.getboolean('save_freqency_list')
        save_word_report = opts.getboolean('save_word_report')
        save_study_plan = opts.getboolean('save_study_plan')

        source_score_multiplier = opts.getfloat('SourceScoreMultiplier')
        source_score_power = opts.getfloat('SourceScorePower')

        proper_nouns_known = opts.getboolean('ProperNounsAlreadyKnown')
        fill_all_morphs_in_plan = opts.getboolean('FillAllMorphsInStudyPlan')

        os.makedirs(output_path, exist_ok=True)

        frequency_list_path = os.path.normpath(output_path + '/frequency.txt')
        word_report_path = os.path.normpath(output_path +
                                            '/word_freq_report.txt')
        study_plan_path = os.path.normpath(output_path + '/study_plan.txt')
        readability_log_path = os.path.normpath(output_path +
                                                '/readability_log.txt')

        master_db = MorphDb()
        unknown_db = MorphDb()

        master_total_instances = 0
        master_current_score = 0

        all_morphs = {}

        if os.path.isfile(master_freq_path):
            with io.open(master_freq_path, encoding='utf-8-sig') as csvfile:
                csvreader = csv.reader(csvfile, delimiter="\t")
                for row in csvreader:
                    try:
                        instances = int(row[0])
                        m = Morpheme(row[1], row[2], row[2], row[3], row[4],
                                     row[5])

                        master_db.addMorph(m, instances)
                        master_total_instances += instances
                    except Exception:
                        # Best effort: rows that cannot be parsed are skipped,
                        # but KeyboardInterrupt/SystemExit still propagate.
                        continue
            self.writeOutput("Master morphs loaded: K %d V %d\n" %
                             (master_db.getTotalNormMorphs(),
                              master_db.getTotalVariationMorphs()))
        else:
            self.writeOutput("Master frequency file '%s' not found.\n" %
                             master_freq_path)
            minimum_master_frequency = 0

        if os.path.isfile(known_words_path):
            known_db = MorphDb(known_words_path, ignoreErrors=True)

            total_k = len(known_db.groups)
            total_v = len(known_db.db)
            self.writeOutput("Known morphs loaded: K %d V %d\n" %
                             (total_k, total_v))
        else:
            self.writeOutput("Known words DB '%s' not found\n" %
                             known_words_path)
            known_db = MorphDb()
        self.known_db = known_db
        if master_total_instances > 0:
            master_current_score = 0
            for ms in master_db.db.values():
                for m, c in ms.items():
                    if known_db.matches(m):
                        master_current_score += c[0]
                        c[1] = True  # mark matched
            self.writeOutput(
                "\n[Current master frequency readability] %0.02f\n" %
                (master_current_score * 100.0 / master_total_instances))

        sources = []

        def measure_readability(self, file_name, is_ass, is_srt):
            """Analyze one source and output its readability row.

            `file_name` is a path, or the literal 'clipboard' to read the
            clipboard.  `log_fp` must be open while this runs.
            """
            self.writeOutput(
                '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' %
                ("Input", "Total Morphs", "Known Morphs", "% Known Morphs",
                 "Total Instances", "Known Instances", "% Readability",
                 "% Proper Nouns", "% Known Lines", "% i+1 Lines"))

            # file_name is 'clipboard' when reading from the clipboard
            log_fp.write('measure_readability %s\n' % file_name)

            proper_noun_count = 0
            i_count = 0
            line_count = 0
            line_morphs = []
            known_line_count = 0
            iplus1_line_count = 0
            known_count = 0
            seen_morphs = {}
            known_morphs = {}
            source_unknown_db = MorphDb()

            def proc_lines(text, is_ass, is_srt):
                """Strip subtitle markup from `text` and parse it line by line."""
                text_index = -1
                num_fields = 1
                srt_count = 0

                def parse_text(text):
                    """Morphemize one chunk and update the running counters."""
                    nonlocal i_count, known_count, proper_noun_count
                    nonlocal line_count, known_line_count, iplus1_line_count

                    log_fp.write('=== parse_text ===\n' + text + '\n')
                    parsed_morphs = getMorphemes(self.morphemizer,
                                                 stripHTML(text))
                    if len(parsed_morphs) == 0:
                        return

                    unknown_count = 0
                    line_missing_morphs = set()
                    for m in parsed_morphs:
                        # Count morph for word report
                        all_morphs[m] = all_morphs.get(m, 0) + 1
                        seen_morphs[m] = seen_morphs.get(m, 0) + 1

                        is_proper_noun = m.isProperNoun()
                        if is_proper_noun:
                            proper_noun_count += 1

                        i_count += 1
                        # Proper nouns are easy to learn, so assume they're
                        # known.  NOTE(review): this ignores the
                        # ProperNounsAlreadyKnown option that the study plan
                        # honours -- confirm that is intended.
                        if known_db.matches(m) or is_proper_noun:
                            known_morphs[m] = known_morphs.get(m, 0) + 1
                            known_count += 1
                        else:
                            unknown_db.addMorph(m, 1)
                            source_unknown_db.addMorph(m, 1)
                            line_missing_morphs.add(m)
                            unknown_count += 1
                    line_count += 1
                    if unknown_count == 0:
                        known_line_count += 1
                    elif unknown_count == 1:
                        iplus1_line_count += 1
                    line_morphs.append(line_missing_morphs)

                filtered_text = ''
                for t in text.splitlines():
                    should_flush = True
                    if is_ass:
                        if 'Format:' in t:
                            formats = [x.strip() for x in t[8:].split(',')]
                            if 'Text' in formats:
                                text_index = formats.index('Text')
                                num_fields = len(formats)
                            else:
                                text_index = -1
                            continue
                        elif ('Dialogue:' not in t) or (text_index < 0):
                            continue
                        t = t[9:].split(',', num_fields - 1)
                        t = t[text_index]
                    elif is_srt:
                        # The first two lines of each SRT entry (index and
                        # timestamps) are skipped; a blank line ends the entry.
                        srt_count += 1
                        if srt_count <= 2:
                            continue
                        elif t == '':
                            srt_count = 0
                        else:
                            should_flush = False

                    if t != '':
                        filtered_text += t + '\n'

                    # Todo: This will flush every line so we can compute per-line readability, which is slower than batching lines.
                    #       Figure out how to get per-line analysis with batched lines.
                    if should_flush:
                        parse_text(filtered_text)
                        filtered_text = ''

                parse_text(filtered_text)

            try:
                if file_name == 'clipboard':
                    text = pyperclip.paste()
                else:
                    with open(file_name.strip(), 'rt', encoding='utf-8') as f:
                        text = f.read()

                text = text.replace(u'\ufeff', '')

                proc_lines(text, is_ass, is_srt)
                source = Source(file_name, seen_morphs, line_morphs,
                                source_unknown_db)
                known_percent = (100.0 * len(known_morphs) / len(seen_morphs)
                                 if seen_morphs else 0.0)
                readability = 0.0 if i_count == 0 else 100.0 * known_count / i_count
                # Guard on the actual divisor (i_count), not on line_count.
                proper_noun_percent = 0.0 if i_count == 0 else 100.0 * proper_noun_count / i_count
                line_percent = 0.0 if line_count == 0 else 100.0 * known_line_count / line_count
                iplus1_percent = 0.0 if line_count == 0 else 100.0 * iplus1_line_count / line_count

                self.writeOutput(
                    '%s\t%d\t%d\t%0.2f\t%d\t%d\t%0.2f\t%0.2f\t%0.2f\t%0.2f\n' %
                    (source.name, len(seen_morphs), len(known_morphs),
                     known_percent, i_count, known_count, readability,
                     proper_noun_percent, line_percent, iplus1_percent))
                # TODO: mirror this row into the readability table of the UI.

                if save_study_plan:
                    sources.append(source)
            except:
                self.writeOutput("Failed to process '%s'\n" % file_name)
                raise

        def accepted_filetype(filename):
            """Return True for the subtitle/text extensions that are analyzed."""
            return filename.lower().endswith(('.srt', '.ass', '.txt'))

        # None -> no usable input path, fall back to the clipboard.
        list_of_files = None
        if input_path and (os.path.isfile(input_path)
                           or os.path.isdir(input_path)):
            print('getting info from files!')
            if os.path.isfile(input_path):
                # os.walk() yields nothing for a plain file, so take the file
                # itself instead of walking it.
                list_of_files = ([input_path]
                                 if accepted_filetype(input_path) else [])
            else:
                list_of_files = [
                    os.path.join(dirpath, filename)
                    for (dirpath, _, filenames) in os.walk(input_path)
                    for filename in filenames
                    if accepted_filetype(filename)
                ]

        # The log is only written while sources are measured; the context
        # manager closes it on every exit path, including the early return.
        with open(readability_log_path, 'wt', encoding='utf-8') as log_fp:
            if list_of_files is not None:
                if not list_of_files:
                    self.writeOutput('\nNo files found to process.\n')
                    return
                # TODO ADD PROGRESS BAR
                for file_path in sorted(list_of_files, key=natural_keys):
                    if os.path.isfile(file_path):
                        extension = os.path.splitext(file_path)[1].lower()
                        measure_readability(self, file_path,
                                            extension == '.ass',
                                            extension == '.srt')
            else:
                # for clipboard run
                measure_readability(self, 'clipboard', False, False)

        if save_word_report:
            self.writeOutput("\n[Saving word report to '%s'...]\n" %
                             word_report_path)
            with open(word_report_path, 'wt', encoding='utf-8') as f:
                last_count = 0
                morph_idx = 0
                group_idx = 0  # rank shared by morphs with the same count
                morph_total = 0.0  # cumulative percentage of all instances
                all_morphs_count = sum(all_morphs.values())

                for morph, count in sorted(all_morphs.items(),
                                           key=operator.itemgetter(1),
                                           reverse=True):
                    if count != last_count:
                        last_count = count
                        group_idx += 1
                    morph_idx += 1
                    morph_delta = 100.0 * count / all_morphs_count
                    morph_total += morph_delta
                    print(
                        '%d\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%0.8f\t%0.8f matches %d'
                        % (count, morph.norm, morph.base, morph.read,
                           morph.pos, morph.subPos, group_idx, morph_idx,
                           morph_delta, morph_total, known_db.matches(morph)),
                        file=f)

        learned_tot = 0
        learned_morphs = []

        all_missing_morphs = []

        def get_line_readability(show, known_db):
            """Return the percentage of lines in `show` with no unknown morph."""
            known_lines = sum(
                1 for missing in show.line_morphs
                if all(known_db.matches(m) for m in missing))
            if known_lines == 0:
                return 0.0
            return 100.0 * known_lines / len(show.line_morphs)

        def fmt_stats(entry):
            """Format the score/frequency part of a missing-morph entry."""
            return ('[score %d ep_freq %d all_freq %d master_freq %d]' %
                    (entry[5], entry[2], entry[3], entry[4]))

        if save_study_plan:
            self.writeOutput("\n[Saving Study Plan to '%s'...]\n" %
                             study_plan_path)
            with open(study_plan_path, 'wt', encoding='utf-8') as f:
                # TODO: mirror the study plan rows into the UI table and show
                # progress while the plan is built.
                for s in sources:
                    known_i = 0
                    seen_i = 0
                    learned_m = 0
                    # Entries are tuples: (morph, count, source_unknown_count,
                    # unknown_count, master_count, score).
                    missing_morphs = []

                    old_line_readability = get_line_readability(s, known_db)

                    for morph, count in s.morphs.items():
                        seen_i += count
                        if known_db.matches(morph) or (proper_nouns_known and
                                                       morph.isProperNoun()):
                            known_i += count
                            continue

                        source_unknown_count = s.unknown_db.getFuzzyCount(
                            morph, known_db)
                        unknown_count = unknown_db.getFuzzyCount(
                            morph, known_db)
                        master_count = master_db.getFuzzyCount(
                            morph, known_db)
                        source_count = source_unknown_count + unknown_count

                        score = pow(
                            source_count, source_score_power
                        ) * source_score_multiplier + master_count
                        entry = (morph, count, source_unknown_count,
                                 unknown_count, master_count, score)
                        missing_morphs.append(entry)

                        if debug_output:
                            f.write('  missing: ' + morph.show() + '\t' +
                                    fmt_stats(entry) + '\n')

                    all_missing_morphs += missing_morphs
                    readability = 100.0 if seen_i == 0 else known_i * 100.0 / seen_i
                    old_readability = readability

                    learned_this_source = []

                    # Learn the highest scoring morphs first until the target
                    # readability for this source is reached.
                    for m in sorted(missing_morphs,
                                    key=operator.itemgetter(5),
                                    reverse=True):
                        if readability >= readability_target:
                            if debug_output:
                                f.write('  readability target reached\n')
                            break

                        if known_db.matches(m[0]):
                            if debug_output:
                                f.write('  known: %s\n' % m[0].show())
                            continue

                        if m[4] < minimum_master_frequency:
                            if debug_output:
                                f.write('  low score: %s %s\n' %
                                        (m[0].show(), fmt_stats(m)))
                            continue

                        learned_morphs.append(m)
                        learned_this_source.append(m)
                        known_i += s.unknown_db.getFuzzyCount(m[0], known_db)
                        learned_m += 1
                        readability = 100.0 if seen_i == 0 else known_i * 100.0 / seen_i
                        known_db.addMLs1(m[0], set())

                    new_line_readability = get_line_readability(s, known_db)

                    learned_tot += learned_m
                    source_str = "'%s' study goal: (%3d/%4d) morph readability: %0.2f -> %0.2f line readabiltiy: %0.2f -> %0.2f\n" % (
                        s.name, learned_m, learned_tot, old_readability,
                        readability, old_line_readability,
                        new_line_readability)
                    self.writeOutput(source_str)
                    f.write(source_str)

                    for m in learned_this_source:
                        f.write('\t' + m[0].show() + '\t' + fmt_stats(m) +
                                '\n')

            if save_frequency_list:
                self.writeOutput("\n[Saving frequency list to '%s'...]\n" %
                                 frequency_list_path)
                with open(frequency_list_path, 'wt',
                          encoding='utf-8') as freq_f:
                    unique_set = set()
                    # First output morphs according to the plan.
                    for m in learned_morphs:
                        if m[0].base in unique_set:
                            continue
                        unique_set.add(m[0].base)
                        print(m[0].base + '\t' + fmt_stats(m), file=freq_f)

                    # Followed by all remaining morphs sorted by score.
                    if fill_all_morphs_in_plan:
                        for m in sorted(all_missing_morphs,
                                        key=operator.itemgetter(5),
                                        reverse=True):
                            if m[0].base in unique_set:
                                continue
                            if m[4] < minimum_master_frequency:
                                continue
                            unique_set.add(m[0].base)
                            print(m[0].base + '\t' + fmt_stats(m),
                                  file=freq_f)

            if master_total_instances > 0:
                master_score = 0
                for ms in master_db.db.values():
                    for m, c in ms.items():
                        if known_db.matches(m):
                            master_score += c[0]
                            c[1] = True  # mark matched
                self.writeOutput(
                    "\n[New master frequency readability] %0.02f -> %0.02f\n"
                    %
                    (master_current_score * 100.0 / master_total_instances,
                     master_score * 100.0 / master_total_instances))

예제 #21

0

파일 보기

파일: main.py 프로젝트: thaumkid/MorphMan

def updateNotes(allDb):
    '''Recompute morph statistics for every note and reorder new cards.

    For each note whose filter allows modification this:
      * collects the note's morphemes from ``allDb``'s fid/loc indexes,
      * works out which of them are unknown / unmature / newly known,
      * computes the Morph Man Index
        (``10000 * N_k + 1000 * lenDiff + usefulness``),
      * rewrites the configured fields and tags, queueing an SQL update only
        when the joined fields or tags actually changed.

    Afterwards the ``due`` of new cards (``type = 0``) is set to the note's
    MMI for every note processed with 'set due based on mmi' enabled.

    allDb -- database of all morphemes; must provide ``fidDb()``,
             ``locDb(recalc=...)``, ``frequency(morpheme)`` and a ``db``
             mapping from morpheme to its locations.

    Returns the known-morpheme database built with
    ``filterDbByMat(allDb, cfg1('threshold_known'))``.
    '''
    t_0, now, db, TAG = time.time(), intTime(), mw.col.db, mw.col.tags
    ds, nid2mmi = [], {}
    N_notes = db.scalar('select count() from notes')
    mw.progress.start(label='Updating data', max=N_notes, immediate=True)
    fidDb = allDb.fidDb()
    locDb = allDb.locDb(recalc=False)  # fidDb() already forces locDb recalc

    # read tag names
    tagNames = (jcfg('Tag_Comprehension'), jcfg('Tag_Vocab'),
                jcfg('Tag_Fresh'), jcfg('Tag_NotReady'),
                jcfg('Tag_AlreadyKnown'), jcfg('Tag_Priority'),
                jcfg('Tag_TooShort'), jcfg('Tag_TooLong'))
    (compTag, vocabTag, freshTag, notReadyTag, alreadyKnownTag, priorityTag,
     tooShortTag, tooLongTag) = tagNames
    TAG.register(tagNames)
    badLengthTag = jcfg2().get('Tag_BadLength')

    # card-type tags that are cleared and re-assigned on every update
    specialTags = (notReadyTag, compTag, vocabTag, freshTag)

    # handle secondary databases
    mw.progress.update(label='Creating seen/known/mature from all.db')
    seenDb = filterDbByMat(allDb, cfg1('threshold_seen'))
    knownDb = filterDbByMat(allDb, cfg1('threshold_known'))
    matureDb = filterDbByMat(allDb, cfg1('threshold_mature'))
    mw.progress.update(label='Loading priority.db')
    priorityDb = MorphDb(cfg1('path_priority'), ignoreErrors=True).db

    if cfg1('saveDbs'):
        mw.progress.update(label='Saving seen/known/mature dbs')
        seenDb.save(cfg1('path_seen'))
        knownDb.save(cfg1('path_known'))
        matureDb.save(cfg1('path_mature'))

    mw.progress.update(label='Updating notes')
    # `i` is read after the loop for the progress bar; give it a value up
    # front so an empty notes table does not raise NameError.
    i = 0
    for i, (nid, mid, flds, guid, tags) in enumerate(
            db.execute('select id, mid, flds, guid, tags from notes')):
        if i % 500 == 0:
            mw.progress.update(value=i)

        note = mw.col.getNote(nid)
        notecfg = getFilter(note)
        if notecfg is None or not notecfg['Modify']:
            continue

        # per-model config lookup
        C = partial(cfg, mid, None)

        # Get all morphemes for note
        morphemes = set()
        for fieldName in notecfg['Fields']:
            try:
                loc = fidDb[(nid, guid, fieldName)]
                morphemes.update(locDb[loc])
            except KeyError:
                # field has no entry in the indexes; nothing to add
                continue

        # Determine un-known/mature morphemes; newKnowns are the ones that
        # are already known but not yet mature.
        unknowns, unmatures, newKnowns = set(), set(), set()
        for morpheme in morphemes:
            isKnown = morpheme in knownDb.db
            if not isKnown:
                unknowns.add(morpheme)
            if morpheme not in matureDb.db:
                unmatures.add(morpheme)
                if isKnown:
                    newKnowns.add(morpheme)

        # Determine MMI - Morph Man Index
        N, N_k, N_m = len(morphemes), len(unknowns), len(unmatures)

        # Bail early for lite update
        if N_k > 2 and C('only update k+2 and below'):
            continue

        # average frequency of unknowns (ie. how common the word is within your collection)
        F_k = sum(allDb.frequency(morpheme) for morpheme in unknowns)
        F_k_avg = F_k // N_k if N_k > 0 else F_k
        usefulness = F_k_avg

        # add bonus for morphs in priority.db
        isPriority = False
        for morpheme in unknowns:
            if morpheme in priorityDb:
                isPriority = True
                usefulness += C('priority.db weight')

        # add bonus for studying recent learned knowns (reinforce)
        for morpheme in newKnowns:
            locs = allDb.db[morpheme]
            if locs:
                # NOTE(review): min(1, x) can never exceed 1, so for
                # maturities >= 1 the divisor below is always 1 and the
                # bonus does not depend on maturity. max(1, x) may have been
                # intended; left unchanged because it would alter ordering.
                ivl = min(1, max(loc.maturity for loc in locs))
                # TODO: maybe average this so it doesnt favor long sentences
                usefulness += C('reinforce new vocab weight') // ivl

        if any(morpheme.pos == u'動詞'
               for morpheme in unknowns):  #FIXME: this isn't working???
            usefulness += C('verb bonus')

        # invert and clamp to 0..999 so a more useful note gets a lower value
        usefulness = 999 - min(999, usefulness)

        # difference from optimal length range (too little context vs long sentence)
        # negative: shorter than the minimum, positive: longer than the maximum
        lenDiffRaw = min(N - C('min good sentence length'),
                         max(0, N - C('max good sentence length')))
        lenDiff = min(9, abs(lenDiffRaw))

        # calculate mmi
        mmi = 10000 * N_k + 1000 * lenDiff + usefulness
        if C('set due based on mmi'):
            nid2mmi[nid] = mmi

        # Fill in various fields/tags on the note based on cfg
        ts, fs = TAG.split(tags), splitFields(flds)

        # clear any 'special' tags, the appropriate will be set in the next few lines
        ts = [t for t in ts if t not in specialTags]

        # determine card type
        if N_m == 0:  # sentence comprehension card, m+0
            cardTag, focusText = compTag, u''
        elif N_k == 1:  # new vocab card, k+1
            # exactly one unknown morpheme: that one is the focus morph
            cardTag, focusText = vocabTag, u'%s' % next(iter(unknowns)).base
        elif N_k > 1:  # M+1+ and K+2+
            cardTag, focusText = notReadyTag, u''
        elif N_m == 1:  # k+0 and m+1: no new vocabulary -> card for newly learned morpheme
            cardTag, focusText = freshTag, u'%s' % list(unmatures)[0].base
        else:  # only case left: k+0 but m+2 or higher -> card for newly learned morphemes
            cardTag, focusText = freshTag, u''
        ts = ts + [cardTag]
        setField(mid, fs, jcfg('Field_FocusMorph'), focusText)

        # set type agnostic fields
        setField(mid, fs, jcfg('Field_UnknownMorphCount'), u'%d' % N_k)
        setField(mid, fs, jcfg('Field_UnmatureMorphCount'), u'%d' % N_m)
        setField(mid, fs, jcfg('Field_MorphManIndex'), u'%d' % mmi)
        setField(mid, fs, jcfg('Field_Unknowns'),
                 u', '.join(u.base for u in unknowns))
        setField(mid, fs, jcfg('Field_Unmatures'),
                 u', '.join(u.base for u in unmatures))
        setField(mid, fs, jcfg('Field_UnknownFreq'), u'%d' % F_k_avg)

        # remove deprecated tag
        if badLengthTag is not None and badLengthTag in ts:
            ts.remove(badLengthTag)

        # other tags: drop the old one, then re-add if it still applies
        if priorityTag in ts:
            ts.remove(priorityTag)
        if isPriority:
            ts.append(priorityTag)

        if tooShortTag in ts:
            ts.remove(tooShortTag)
        if lenDiffRaw < 0:
            ts.append(tooShortTag)

        if tooLongTag in ts:
            ts.remove(tooLongTag)
        if lenDiffRaw > 0:
            ts.append(tooLongTag)

        # remove unnecessary tags
        if not jcfg('Option_SetNotRequiredTags'):
            unnecessary = [priorityTag, tooShortTag, tooLongTag]
            ts = [tag for tag in ts if tag not in unnecessary]

        # update sql db
        tags_ = TAG.join(TAG.canonify(ts))
        flds_ = joinFields(fs)
        if flds != flds_ or tags != tags_:  # only update notes that have changed
            csum = fieldChecksum(fs[0])
            sfld = stripHTML(fs[getSortFieldIndex(mid)])
            ds.append({
                'now': now,
                'tags': tags_,
                'flds': flds_,
                'sfld': sfld,
                'csum': csum,
                'usn': mw.col.usn(),
                'nid': nid
            })

    mw.progress.update(value=i, label='Updating anki database...')
    mw.col.db.executemany(
        'update notes set tags=:tags, flds=:flds, sfld=:sfld, csum=:csum, mod=:now, usn=:usn where id=:nid',
        ds)

    # Now reorder new cards based on MMI
    mw.progress.update(value=i, label='Updating new card ordering...')
    ds = []

    # "type = 0": new cards
    # "type = 1": learning cards [is supposed to be learning: in my case no learning card had this type]
    # "type = 2": review cards
    for (cid, nid,
         due) in db.execute('select id, nid, due from cards where type = 0'):
        if nid not in nid2mmi:  # owise it was disabled
            continue
        due_ = nid2mmi[nid]
        if due != due_:  # only update cards that have changed
            ds.append({
                'now': now,
                'due': due_,
                'usn': mw.col.usn(),
                'cid': cid
            })
    mw.col.db.executemany(
        'update cards set due=:due, mod=:now, usn=:usn where id=:cid', ds)
    mw.reset()

    printf('Updated notes in %f sec' % (time.time() - t_0))
    mw.progress.finish()
    return knownDb

예제 #22

0

파일 보기

파일: database_manager.py 프로젝트: wsavoie/Manabi

 def loadB(self):
     self.bPath = self.dbBPath.text()
     self.bDb = MorphDb(path=self.bPath)