Пример #1
0
def run( duelingSubsPath, outputSubsPath, whitelist, blacklist, matureFmt, knownFmt, unknownFmt ):
    """Rewrite a dueling-subtitles file, choosing a format per dialogue pair.

    Lines starting with 'Dialogue' are consumed two at a time. The first line
    of each pair is analysed with M.getMorphemes and checked against the known
    and mature dbs; the second is carried along as 'eng'. One output line is
    produced per pair by %-formatting one of the three format strings with a
    dict holding the keys jpn, eng, N_k, N_m, unknowns and unmatures.

    duelingSubsPath -- utf-8 subtitle file to read
    outputSubsPath  -- utf-8 file to write (header lines, then the new lines)
    whitelist, blacklist -- forwarded to M.getMorphemes as ws / bs
    matureFmt  -- used when N_m == 0
    knownFmt   -- used when N_m != 0 and N_k == 0
    unknownFmt -- used otherwise

    Raises AssertionError when the number of dialogue lines is odd.
    """
    # Load files
    kdb = M.MorphDb( knownDbPath )
    mdb = M.MorphDb( matureDbPath )
    # 'with' closes the input handle even if reading/decoding fails
    with codecs.open( duelingSubsPath, 'r', 'utf-8' ) as inFile:
        subFileLines = inFile.readlines()
    # Start Mecab
    # NOTE(review): mp is never shut down in this function - confirm whether it should be
    mp = M.mecab()

    # Get dueling subs
    dialogueLines = [ l for l in subFileLines if l.startswith( u'Dialogue' ) ]
    if dialogueLines:
        # everything before the first dialogue line is copied through untouched
        header = subFileLines[ : subFileLines.index( dialogueLines[0] ) ]
    else:
        # no dialogue at all: the whole file is header (was an IndexError before)
        header = subFileLines
    assert len( dialogueLines ) % 2 == 0, 'Should be an even number of dialogue lines'

    lines = []
    for i in xrange( 0, len( dialogueLines ), 2 ):
        jpn, eng = dialogueLines[i:i+2]
        # the right-hand side is evaluated first, so getPreText sees the raw line
        jpn, eng, pre = getText( jpn ), getText( eng ), getPreText( jpn )

        # get unknowns
        ms = M.getMorphemes( mp, jpn, ws=whitelist, bs=blacklist )
        unknowns, N_k = getNotInDb( ms, kdb.db )
        unmatures, N_m = getNotInDb( ms, mdb.db )
        d = { 'jpn':jpn, 'eng':eng, 'N_k':N_k, 'N_m':N_m, 'unknowns':unknowns, 'unmatures':unmatures }

        if N_m == 0:
            fmt = matureFmt
        elif N_k == 0:
            fmt = knownFmt
        else:
            fmt = unknownFmt
        lines.append( pre + fmt % d )

    # 'with' closes (and flushes) the output even if a write raises
    with codecs.open( outputSubsPath, 'w', 'utf-8' ) as outFile:
        outFile.write( u''.join( header ) )
        outFile.write( u'\n'.join( lines ) )
Пример #2
0
def pre( ed ):
   """Ask which morpheme db to open and build the initial state dict.

   Returns 'BAIL' when the open-file dialog yields no path; otherwise a dict
   with the loaded db's .db attribute, the blacklist for ed, two empty maps
   ('fmmap', 'mfmap'), a mecab instance and ed itself.
   """
   chosen = QFileDialog.getOpenFileName( caption='Open db', directory=util.knownDbPath )
   if chosen:
      blacklist = util.getBlacklist( ed )
      loaded = M.MorphDb( chosen ).db
      state = { 'ed':ed, 'bs':blacklist, 'db':loaded }
      # mecab is created last, after the db has been loaded
      state['mp'] = M.mecab( None )
      state['fmmap'] = {}
      state['mfmap'] = {}
      return state
   return 'BAIL'
Пример #3
0
def pre( ed ):
    """Ask for a source name and a save location, then build the state dict.

    Returns 'BAIL' if the name prompt is not accepted or no save path is
    chosen. The suggested save path is util.dbPath + name + '.db'.
    """
    srcName, accepted = QInputDialog.getText( ed, 'Enter name of source', 'Name', QLineEdit.Normal, 'recentSelection' )
    if not accepted:
        return 'BAIL'

    suggested = util.dbPath + srcName + '.db'
    target = QFileDialog.getSaveFileName( caption='Save morpheme db to?', directory=suggested )
    if not target:
        return 'BAIL'

    state = dict( ed=ed, srcName=srcName, filePath=target, ms=[] )
    state['mp'] = m.mecab( None )
    return state
Пример #4
0
def pre( ed ):
   """Ask for tags and a morpheme list, then build the initial state dict.

   The morpheme text is split on newlines and each row on tabs, giving a list
   of tuples under 'ms'.

   NOTE(review): when either prompt is not accepted this returns None, not a
   'BAIL' string - confirm the caller treats None as "abort".
   """
   tagText, accepted = QInputDialog.getText( ed, 'Enter Tags', 'Tags', QLineEdit.Normal, 'myMorph' )
   if accepted:
      rawMorphemes, accepted = QInputDialog.getText( ed, 'Enter morphemes', 'Morphemes', QLineEdit.Normal, '' )
      if accepted:
         blacklist = util.getBlacklist( ed )
         rows = []
         for line in rawMorphemes.split('\n'):
            rows.append( tuple( line.split('\t') ) )
         return { 'mp':M.mecab(None), 'ms':rows, 'tags':tagText, 'bs':blacklist, 'ed':ed }
   return
Пример #5
0
def pre( ed ):
   """Ask for tags and a morpheme db to open, then build the state dict.

   Returns 'BAIL' when the tag prompt is not accepted or left empty, or when
   the open-file dialog yields no path. 'tags' is stored as unicode and 'db'
   is the M.MorphDb object itself.
   """
   tagText, accepted = QInputDialog.getText( ed, 'Enter Tags', 'Tags', QLineEdit.Normal, 'myMorph' )
   if not ( accepted and tagText ):
      return 'BAIL'

   chosen = QFileDialog.getOpenFileName( caption='Open db', directory=util.knownDbPath )
   if not chosen:
      return 'BAIL'

   blacklist = util.getBlacklist( ed )
   morphDb = M.MorphDb( chosen )
   parser = M.mecab()
   return { 'ed':ed, 'bs':blacklist, 'tags':unicode(tagText), 'db':morphDb, 'mp':parser }
Пример #6
0
def post( st ):
   """Show a message box listing the morphemes of st['txt'].

   The box has two sections: every morpheme extracted from st['txt'] (using
   st['bs'] as blacklist), and those of them not contained in the db loaded
   from util.knownDbPath. The box is parented to st['ed'].
   """
   import morphemes as m
   mp = m.mecab( None )
   try:
      ms = m.getMorphemes( mp, st['txt'], bs=st['bs'] )
   finally:
      # kill mecab even when extraction raises, so it is never left behind
      mp.kill()
   txt = m.ms2str( ms ).decode('utf-8')

   kdb = m.loadDb( util.knownDbPath )
   newMs = [ x for x in ms if x not in kdb ]
   newTxt = m.ms2str( newMs ).decode('utf-8')

   txt = '-----All-----\n' + txt + '\n-----New-----\n' + newTxt
   QMessageBox.information( st['ed'], 'Morphemes', txt )
0
def pre(ed):
    """Prompt for a morpheme db file and return the initial state dict.

    Returns 'BAIL' when no file is chosen. The dict carries the loaded db's
    .db attribute, the blacklist for ed, empty 'fmmap'/'mfmap' maps, a mecab
    instance and ed.
    """
    db_path = QFileDialog.getOpenFileName(caption='Open db',
                                          directory=util.knownDbPath)
    if not db_path:
        return 'BAIL'

    blacklist = util.getBlacklist(ed)
    loaded = M.MorphDb(db_path).db

    state = dict(mp=M.mecab(None), fmmap={}, mfmap={})
    state.update(db=loaded, bs=blacklist, ed=ed)
    return state
Пример #8
0
def pre(ed):
    """Prompt for a field name and a save path; return the initial state dict.

    Returns 'BAIL' if the field prompt is not accepted or no save path is
    chosen. The chosen path is stored as str under 'filePath', together with
    a new empty M.MorphDb and a mecab instance.
    """
    field_name, accepted = QInputDialog.getText(
        ed, 'Enter name of field to extract from', 'Field Name',
        QLineEdit.Normal, 'Expression')
    if not accepted:
        return 'BAIL'

    suggested = util.dbPath + 'mySelection.db'
    target = QFileDialog.getSaveFileName(caption='Save morpheme db to?',
                                         directory=suggested)
    if not target:
        return 'BAIL'

    state = {'ed': ed, 'fieldName': field_name}
    state['filePath'] = str(target)
    state['db'] = M.MorphDb()
    state['mp'] = M.mecab()
    return state
Пример #9
0
def pre(ed):
    """Prompt for tags and a morpheme db file; return the initial state dict.

    Returns 'BAIL' when the tag prompt is not accepted or empty, or when no
    file is chosen. 'tags' is stored as unicode; 'db' is the M.MorphDb object.
    """
    tag_text, accepted = QInputDialog.getText(ed, 'Enter Tags', 'Tags',
                                              QLineEdit.Normal, 'myMorph')
    if not (accepted and tag_text):
        return 'BAIL'

    db_path = QFileDialog.getOpenFileName(caption='Open db',
                                          directory=util.knownDbPath)
    if not db_path:
        return 'BAIL'

    blacklist = util.getBlacklist(ed)
    morph_db = M.MorphDb(db_path)
    parser = M.mecab()
    return dict(mp=parser, db=morph_db, tags=unicode(tag_text), bs=blacklist,
                ed=ed)
Пример #10
0
def run(duelingSubsPath, outputSubsPath, whitelist, blacklist, matureFmt,
        knownFmt, unknownFmt):
    """Rewrite a dueling-subtitles file, choosing a format per dialogue pair.

    Lines starting with 'Dialogue' are consumed two at a time. The first line
    of each pair is analysed with M.getMorphemes and checked against the known
    and mature dbs; the second is carried along as 'eng'. One output line is
    produced per pair by %-formatting one of the three format strings with a
    dict holding the keys jpn, eng, N_k, N_m, unknowns and unmatures.

    duelingSubsPath -- utf-8 subtitle file to read
    outputSubsPath  -- utf-8 file to write (header lines, then the new lines)
    whitelist, blacklist -- forwarded to M.getMorphemes as ws / bs
    matureFmt  -- used when N_m == 0
    knownFmt   -- used when N_m != 0 and N_k == 0
    unknownFmt -- used otherwise

    Raises AssertionError when the number of dialogue lines is odd.
    """
    # Load files
    kdb = M.MorphDb(knownDbPath)
    mdb = M.MorphDb(matureDbPath)
    # 'with' closes the input handle even if reading/decoding fails
    with codecs.open(duelingSubsPath, 'r', 'utf-8') as inFile:
        subFileLines = inFile.readlines()
    # Start Mecab
    # NOTE(review): mp is never shut down in this function - confirm whether
    # it should be
    mp = M.mecab()

    # Get dueling subs
    dialogueLines = [l for l in subFileLines if l.startswith(u'Dialogue')]
    if dialogueLines:
        # everything before the first dialogue line is copied through untouched
        header = subFileLines[:subFileLines.index(dialogueLines[0])]
    else:
        # no dialogue at all: the whole file is header (was an IndexError)
        header = subFileLines
    assert len(
        dialogueLines) % 2 == 0, 'Should be an even number of dialogue lines'

    lines = []
    for i in xrange(0, len(dialogueLines), 2):
        jpn, eng = dialogueLines[i:i + 2]
        # the right-hand side is evaluated first, so getPreText sees the raw
        # dialogue line rather than the extracted text
        jpn, eng, pre = getText(jpn), getText(eng), getPreText(jpn)

        # get unknowns
        ms = M.getMorphemes(mp, jpn, ws=whitelist, bs=blacklist)
        unknowns, N_k = getNotInDb(ms, kdb.db)
        unmatures, N_m = getNotInDb(ms, mdb.db)
        d = {
            'jpn': jpn,
            'eng': eng,
            'N_k': N_k,
            'N_m': N_m,
            'unknowns': unknowns,
            'unmatures': unmatures
        }

        if N_m == 0:
            fmt = matureFmt
        elif N_k == 0:
            fmt = knownFmt
        else:
            fmt = unknownFmt
        lines.append(pre + fmt % d)

    # 'with' closes (and flushes) the output even if a write raises
    with codecs.open(outputSubsPath, 'w', 'utf-8') as outFile:
        outFile.write(u''.join(header))
        outFile.write(u'\n'.join(lines))
Пример #11
0
def pre( ed ):
    """Build the state dict holding a rank db and a mecab instance.

    Returns 'BAIL' when util.requireKnownDb() is falsy. The rank db is made by
    R.mkRankDb from the db loaded from util.knownDbPath.
    """
    if util.requireKnownDb():
        knownDb = M.loadDb( util.knownDbPath )
        rankDb = R.mkRankDb( knownDb )
        state = { 'rdb':rankDb }
        state['mp'] = M.mecab( None )
        return state
    return 'BAIL'
Пример #12
0
def pre( ed ):
   """Build the initial state dict: ed, empty 'txt', blacklist and mecab.

   Returns 'BAIL' when util.requireKnownDb() is falsy.
   """
   if util.requireKnownDb():
      state = dict( ed=ed, txt='', bs=util.getBlacklist( ed ) )
      state['mp'] = M.mecab()
      return state
   return 'BAIL'
Пример #13
0
    def mkAll( self ): # IO ()
        """Bring all.db up to date with the deck's facts, save it and return it.

        The db is loaded from self.allPath on first use (or started empty when
        that raises IOError) and cached on self._allDb. Each configured morph
        field of each fact is then compared with its stored location:

          * same field value, different maturities -> location replaced,
            stored morphemes reused
          * different field value -> location replaced, morphemes re-extracted
          * no stored location -> added, provided morphemes were found

        Afterwards the db is rebuilt from the location map, saved to
        self.allPath, and the timing entries in self.cfg are updated. mecab is
        sent sigterm whether or not processing succeeded.
        """
        log( 'Getting initial all.db...' )
        if not hasattr( self, '_allDb' ):
            try:
                self._allDb = M.MorphDb( self.allPath )
                debug( '  * Updating existing all.db' )
            except IOError:
                self._allDb = M.MorphDb()
                debug( '  * Creating new all.db from scratch' )
        allDb = self._allDb
        log( '...done' )

        mp = M.mecab()
        try:
            # pre-cache lookups
            fieldNames = self.cfg['morph fields']
            whitelist, blacklist = parseWhitelist( self.cfg['whitelist'] ), parseWhitelist( self.cfg['blacklist'] )
            fid2cardsDb = self.fid2cardsDb()
            fidDb = allDb.fidDb()
            locDb = allDb.locDb()
            fs = self.getFacts()

            i, lfs = 0, len( fs )
            start = time.time()
            last = time.time()
            for f in fs:
                mats = [ c.interval for c in fid2cardsDb[ f.id ] ]
                for fieldName in fieldNames:
                    try:
                        fieldValue = normalizeFieldValue( f[ fieldName ] )
                    except KeyError: # if fact doesn't have the field just skip it
                        continue
                    # NOTE(review): the KeyError handler below also covers the
                    # locDb.pop calls, not just the fidDb lookup - confirm intended
                    try: # existing location
                        loc = fidDb[ (f.id, fieldName) ]
                        # new loc only; no morpheme change
                        if loc.fieldValue == fieldValue and loc.maturities != mats:
                            debug('        .mats for %d[%s]' % ( f.id, fieldName ) )
                            newLoc = M.AnkiDeck( f.id, fieldName, fieldValue, self.deckPath, self.deckName, mats )
                            ms = locDb.pop( loc )
                            locDb[ newLoc ] = ms
                        # new loc and new morphemes
                        elif loc.fieldValue != fieldValue:
                            debug('        .morphs for %d[%s]' % ( f.id, fieldName ) )
                            newLoc = M.AnkiDeck( f.id, fieldName, fieldValue, self.deckPath, self.deckName, mats )
                            ms = M.getMorphemes( mp, fieldValue, ws=whitelist, bs=blacklist )
                            locDb.pop( loc )
                            locDb[ newLoc ] = ms
                    except KeyError: # new location
                        loc = M.AnkiDeck( f.id, fieldName, fieldValue, self.deckPath, self.deckName, mats )
                        ms = M.getMorphemes( mp, fieldValue, ws=whitelist, bs=blacklist )
                        if ms:
                            debug('        .loc for %d[%s]' % ( f.id, fieldName ) )
                            locDb[ loc ] = ms
                i += 1
                # progress report every 100 facts, timing the last batch
                if i % 100 == 0:
                    log('    %d / %d = %d%% in %f sec' % ( i, lfs, 100.*i/lfs, time.time()-last ) )
                    last = time.time()
            log( 'Proccessed all facts in %f sec. Now saving...' % ( time.time()-start ) )
            # rebuild the db from the updated location map before saving
            allDb.clear()
            allDb.addFromLocDb( locDb )
            allDb.save( self.allPath )
            self.cfg['last db update'][ self.allPath ] = time.time()
            self.cfg['last all.db update took'] = time.time() - start
            log( '...done' )
            return self._allDb
        finally:
            # previously skipped on any exception, leaving mecab running
            sigterm( mp )
Пример #14
0
def pre( ed ):
    """Ask for a field name and a save path, then build the state dict.

    Returns 'BAIL' if the field prompt is not accepted or no save path is
    chosen. The chosen path is stored as str under 'filePath', together with
    a new empty M.MorphDb and a mecab instance.
    """
    fieldName, accepted = QInputDialog.getText( ed, 'Enter name of field to extract from', 'Field Name', QLineEdit.Normal, 'Expression' )
    if not accepted:
        return 'BAIL'

    suggested = util.dbPath + 'mySelection.db'
    target = QFileDialog.getSaveFileName( caption='Save morpheme db to?', directory=suggested )
    if not target:
        return 'BAIL'

    state = { 'ed':ed, 'fieldName':fieldName }
    state['filePath'] = str( target )
    state['db'] = M.MorphDb()
    state['mp'] = M.mecab()
    return state
Пример #15
0
    def mkAll(self):  # IO ()
        """Bring all.db up to date with the deck's facts, save it and return it.

        The db is loaded from self.allPath on first use (or started empty when
        that raises IOError) and cached on self._allDb. Each configured morph
        field of each fact is then compared with its stored location:

          * same field value, different maturities -> location replaced,
            stored morphemes reused
          * different field value -> location replaced, morphemes re-extracted
          * no stored location -> added, provided morphemes were found

        Afterwards the db is rebuilt from the location map, saved to
        self.allPath, and the timing entries in self.cfg are updated. mecab is
        sent sigterm whether or not processing succeeded.
        """
        log('Getting initial all.db...')
        if not hasattr(self, '_allDb'):
            try:
                self._allDb = M.MorphDb(self.allPath)
                debug('  * Updating existing all.db')
            except IOError:
                self._allDb = M.MorphDb()
                debug('  * Creating new all.db from scratch')
        allDb = self._allDb
        log('...done')

        mp = M.mecab()
        try:
            # pre-cache lookups
            fieldNames = self.cfg['morph fields']
            whitelist, blacklist = self.cfg['whitelist'], self.cfg['blacklist']
            fid2cardsDb = self.fid2cardsDb()
            fidDb = allDb.fidDb()
            locDb = allDb.locDb()
            fs = self.getFacts()

            i, lfs = 0, len(fs)
            start = time.time()
            last = time.time()
            for f in fs:
                mats = [c.interval for c in fid2cardsDb[f.id]]
                for fieldName in fieldNames:
                    try:
                        fieldValue = normalizeFieldValue(f[fieldName])
                    except KeyError:  # if fact doesn't have the field just skip it
                        continue
                    # NOTE(review): the KeyError handler below also covers the
                    # locDb.pop calls, not just the fidDb lookup - confirm
                    # that is intended
                    try:  # existing location
                        loc = fidDb[(f.id, fieldName)]
                        # new loc only; no morpheme change
                        if loc.fieldValue == fieldValue and loc.maturities != mats:
                            debug('        .mats for %d[%s]' %
                                  (f.id, fieldName))
                            newLoc = M.AnkiDeck(f.id, fieldName, fieldValue,
                                                self.deckPath, self.deckName,
                                                mats)
                            ms = locDb.pop(loc)
                            locDb[newLoc] = ms
                        # new loc and new morphemes
                        elif loc.fieldValue != fieldValue:
                            debug('        .morphs for %d[%s]' %
                                  (f.id, fieldName))
                            newLoc = M.AnkiDeck(f.id, fieldName, fieldValue,
                                                self.deckPath, self.deckName,
                                                mats)
                            ms = M.getMorphemes(mp,
                                                fieldValue,
                                                ws=whitelist,
                                                bs=blacklist)
                            locDb.pop(loc)
                            locDb[newLoc] = ms
                    except KeyError:  # new location
                        loc = M.AnkiDeck(f.id, fieldName, fieldValue,
                                         self.deckPath, self.deckName, mats)
                        ms = M.getMorphemes(mp,
                                            fieldValue,
                                            ws=whitelist,
                                            bs=blacklist)
                        if ms:
                            debug('        .loc for %d[%s]' %
                                  (f.id, fieldName))
                            locDb[loc] = ms
                i += 1
                # progress report every 100 facts, timing the last batch
                if i % 100 == 0:
                    log('    %d / %d = %d%% in %f sec' %
                        (i, lfs, 100. * i / lfs, time.time() - last))
                    last = time.time()
            log('Proccessed all facts in %f sec. Now saving...' %
                (time.time() - start))
            # rebuild the db from the updated location map before saving
            allDb.clear()
            allDb.addFromLocDb(locDb)
            allDb.save(self.allPath)
            self.cfg['last db update'][self.allPath] = time.time()
            self.cfg['last all.db update took'] = time.time() - start
            log('...done')
            return self._allDb
        finally:
            # previously skipped on any exception, leaving mecab running
            sigterm(mp)
Пример #16
0
def pre( ed ):
   """Build the state dict: blacklist, known db and a mecab instance.

   Returns 'BAIL' when util.requireKnownDb() is falsy. The known db is loaded
   from util.knownDbPath.
   """
   if util.requireKnownDb():
      state = { 'bs':util.getBlacklist( ed ) }
      state['kdb'] = m.loadDb( util.knownDbPath )
      state['mp'] = m.mecab( None )
      return state
   return 'BAIL'
Пример #17
0
def pre( ed ):
   """Return the starting state: ed, an empty 'txt', the blacklist and mecab.

   Returns 'BAIL' instead when util.requireKnownDb() is falsy.
   """
   if not util.requireKnownDb():
      return 'BAIL'
   blacklist = util.getBlacklist( ed )
   parser = M.mecab()
   return { 'mp':parser, 'bs':blacklist, 'txt':'', 'ed':ed }