def run( duelingSubsPath, outputSubsPath, whitelist, blacklist, matureFmt, knownFmt, unknownFmt ):
    """Rewrite a dueling-subtitles file, formatting each pair by db membership.

    Reads `duelingSubsPath` as UTF-8, treats every line starting with
    u'Dialogue' as part of consecutive (jpn, eng) pairs, and writes the
    original header followed by one formatted line per pair to
    `outputSubsPath`.

    For each pair the morphemes of the jpn text are extracted with
    `whitelist` / `blacklist` and checked against the two dbs loaded from
    `knownDbPath` and `matureDbPath` (names defined outside this function).
    The format string is chosen as follows and applied with `%` to a dict
    with keys 'jpn', 'eng', 'N_k', 'N_m', 'unknowns', 'unmatures':

      * `matureFmt`  when N_m == 0
      * `knownFmt`   when N_k == 0
      * `unknownFmt` otherwise

    Raises AssertionError when the number of dialogue lines is odd.
    """
    # Load files
    kdb = M.MorphDb( knownDbPath )
    mdb = M.MorphDb( matureDbPath )
    # `with` guarantees the handle is closed even if decoding fails
    with codecs.open( duelingSubsPath, 'r', 'utf-8' ) as subFile:
        subFileLines = subFile.readlines()

    # Start Mecab
    # NOTE(review): this handle is never shut down in this function; confirm
    # whether that is intentional for this entry point.
    mp = M.mecab()

    # Get dueling subs
    dialogueLines = [ line for line in subFileLines if line.startswith( u'Dialogue' ) ]
    if dialogueLines:
        header = subFileLines[ : subFileLines.index( dialogueLines[0] ) ]
    else:
        # No dialogue at all: everything is header, nothing to format
        header = subFileLines
    assert len( dialogueLines ) % 2 == 0, 'Should be an even number of dialogue lines'

    lines = []
    for i in xrange( 0, len( dialogueLines ), 2 ):
        jpnLine, engLine = dialogueLines[i:i+2]
        # The prefix is taken from the raw jpn line, not from its extracted text
        jpn, eng, prefix = getText( jpnLine ), getText( engLine ), getPreText( jpnLine )

        # get unknowns
        ms = M.getMorphemes( mp, jpn, ws=whitelist, bs=blacklist )
        unknowns, N_k = getNotInDb( ms, kdb.db )
        unmatures, N_m = getNotInDb( ms, mdb.db )
        d = { 'jpn':jpn, 'eng':eng, 'N_k':N_k, 'N_m':N_m, 'unknowns':unknowns, 'unmatures':unmatures }

        if N_m == 0:
            lines.append( prefix + matureFmt % d )
        elif N_k == 0:
            lines.append( prefix + knownFmt % d )
        else:
            lines.append( prefix + unknownFmt % d )

    # Header lines keep their own newlines; formatted lines are newline-joined
    with codecs.open( outputSubsPath, 'w', 'utf-8' ) as outFile:
        outFile.write( u''.join( header ) )
        outFile.write( u'\n'.join( lines ) )
def pre( ed ):
    """Ask for a db file and build the initial state dict.

    Shows an open-file dialog (caption 'Open db') starting at
    util.knownDbPath. Returns the string 'BAIL' when no path is chosen.
    Otherwise returns a dict with a mecab handle ('mp'), two empty dicts
    ('fmmap', 'mfmap'), the `.db` of the loaded MorphDb ('db'), the
    blacklist obtained for `ed` ('bs') and `ed` itself ('ed').
    """
    chosenPath = QFileDialog.getOpenFileName( caption='Open db', directory=util.knownDbPath )
    if not chosenPath:
        return 'BAIL'

    blacklist = util.getBlacklist( ed )
    loadedDb = M.MorphDb( chosenPath ).db

    state = {
        'mp': M.mecab( None ),
        'fmmap': {},
        'mfmap': {},
        'db': loadedDb,
        'bs': blacklist,
        'ed': ed,
    }
    return state
def pre( ed ):
    """Ask for a source name and a save path, then build the state dict.

    First prompts for a name (default 'recentSelection'), then shows a
    save-file dialog whose starting path is util.dbPath + name + '.db'.
    Returns the string 'BAIL' if the name prompt is cancelled or no save
    path is chosen. Otherwise returns a dict with keys 'ed', 'srcName',
    'filePath', 'ms' (an empty list) and 'mp' (a mecab handle).
    """
    srcName, accepted = QInputDialog.getText(
        ed, 'Enter name of source', 'Name', QLineEdit.Normal, 'recentSelection' )
    if not accepted:
        return 'BAIL'

    suggestedPath = util.dbPath + srcName + '.db'
    savePath = QFileDialog.getSaveFileName( caption='Save morpheme db to?', directory=suggestedPath )
    if not savePath:
        return 'BAIL'

    state = {
        'ed': ed,
        'srcName': srcName,
        'filePath': savePath,
        'ms': [],
        'mp': m.mecab( None ),
    }
    return state
def pre( ed ):
    """Ask for tags and a morpheme list, then build the state dict.

    Prompts for tags (default 'myMorph') and then for morphemes. The
    morpheme text is split on newlines into rows, and each row is split
    on tabs into a tuple. Returns None if either prompt is cancelled.
    Otherwise returns a dict with keys 'mp', 'ms', 'tags', 'bs' and 'ed'.

    NOTE(review): cancelling yields None here rather than a 'BAIL'
    string; confirm which sentinel the caller of this hook expects.
    """
    tags, accepted = QInputDialog.getText( ed, 'Enter Tags', 'Tags', QLineEdit.Normal, 'myMorph' )
    if not accepted:
        return None

    rawMorphemes, accepted = QInputDialog.getText( ed, 'Enter morphemes', 'Morphemes', QLineEdit.Normal, '' )
    if not accepted:
        return None

    blacklist = util.getBlacklist( ed )

    # One tuple per line, fields separated by tabs
    morphemes = []
    for row in rawMorphemes.split('\n'):
        morphemes.append( tuple( row.split('\t') ) )

    state = {
        'mp': M.mecab( None ),
        'ms': morphemes,
        'tags': tags,
        'bs': blacklist,
        'ed': ed,
    }
    return state
def pre( ed ):
    """Ask for tags and a db file, then build the state dict.

    Prompts for tags (default 'myMorph') and then shows an open-file
    dialog (caption 'Open db') starting at util.knownDbPath. Returns the
    string 'BAIL' if the prompt is cancelled, the tags are empty, or no
    path is chosen. Otherwise returns a dict with a mecab handle ('mp'),
    the loaded MorphDb ('db'), the tags converted with unicode() ('tags'),
    the blacklist for `ed` ('bs') and `ed` itself ('ed').
    """
    tags, accepted = QInputDialog.getText( ed, 'Enter Tags', 'Tags', QLineEdit.Normal, 'myMorph' )
    if not ( accepted and tags ):
        return 'BAIL'

    chosenPath = QFileDialog.getOpenFileName( caption='Open db', directory=util.knownDbPath )
    if not chosenPath:
        return 'BAIL'

    blacklist = util.getBlacklist( ed )
    morphDb = M.MorphDb( chosenPath )

    state = {
        'mp': M.mecab(),
        'db': morphDb,
        'tags': unicode( tags ),
        'bs': blacklist,
        'ed': ed,
    }
    return state
def post( st ):
    """Show the morphemes found in st['txt'], and which are not in the known db.

    Extracts morphemes from st['txt'] using the blacklist st['bs'], loads
    the db at util.knownDbPath, and displays an information box parented
    to st['ed'] with two sections: '-----All-----' (every extracted
    morpheme) and '-----New-----' (those for which `x not in kdb` holds).
    """
    import morphemes as m

    mp = m.mecab( None )
    try:
        ms = m.getMorphemes( mp, st['txt'], bs=st['bs'] )
    finally:
        # Always release the mecab handle, even if extraction raises
        mp.kill()

    txt = m.ms2str( ms ).decode('utf-8')

    kdb = m.loadDb( util.knownDbPath )
    newMs = [ x for x in ms if x not in kdb ]
    newTxt = m.ms2str( newMs ).decode('utf-8')

    txt = '-----All-----\n' + txt + '\n-----New-----\n' + newTxt
    QMessageBox.information( st['ed'], 'Morphemes', txt )
def pre(ed):
    """Ask for a db file and build the initial state dict.

    Shows an open-file dialog (caption 'Open db') starting at
    util.knownDbPath. Returns the string 'BAIL' when no path is chosen.
    Otherwise returns a dict with keys 'mp' (mecab handle), 'fmmap' and
    'mfmap' (both empty dicts), 'db' (the `.db` of the loaded MorphDb),
    'bs' (blacklist obtained for `ed`) and 'ed'.
    """
    selected = QFileDialog.getOpenFileName(caption='Open db', directory=util.knownDbPath)
    if selected:
        blacklist = util.getBlacklist(ed)
        morphs = M.MorphDb(selected).db
        return {
            'mp': M.mecab(None),
            'fmmap': {},
            'mfmap': {},
            'db': morphs,
            'bs': blacklist,
            'ed': ed,
        }
    return 'BAIL'
def pre(ed):
    """Ask for a field name and a save path, then build the state dict.

    Prompts for the field to extract from (default 'Expression'), then
    shows a save-file dialog starting at util.dbPath + 'mySelection.db'.
    Returns the string 'BAIL' if the prompt is cancelled or no save path
    is chosen. Otherwise returns a dict with keys 'ed', 'fieldName',
    'filePath' (the path converted with str()), 'db' (a new, empty
    MorphDb) and 'mp' (a mecab handle).
    """
    fieldName, accepted = QInputDialog.getText(
        ed, 'Enter name of field to extract from', 'Field Name',
        QLineEdit.Normal, 'Expression')
    if not accepted:
        return 'BAIL'

    suggestedPath = util.dbPath + 'mySelection.db'
    savePath = QFileDialog.getSaveFileName(caption='Save morpheme db to?', directory=suggestedPath)
    if not savePath:
        return 'BAIL'

    state = {
        'ed': ed,
        'fieldName': fieldName,
        'filePath': str(savePath),
        'db': M.MorphDb(),
        'mp': M.mecab(),
    }
    return state
def pre(ed):
    """Ask for tags and a db file, then build the state dict.

    Prompts for tags (default 'myMorph'), then shows an open-file dialog
    (caption 'Open db') starting at util.knownDbPath. Returns the string
    'BAIL' if the prompt is cancelled, the tags are empty, or no path is
    chosen. Otherwise returns a dict with keys 'mp' (mecab handle), 'db'
    (the loaded MorphDb), 'tags' (converted with unicode()), 'bs'
    (blacklist obtained for `ed`) and 'ed'.
    """
    tags, confirmed = QInputDialog.getText(ed, 'Enter Tags', 'Tags', QLineEdit.Normal, 'myMorph')
    if not (confirmed and tags):
        return 'BAIL'

    selected = QFileDialog.getOpenFileName(caption='Open db', directory=util.knownDbPath)
    if not selected:
        return 'BAIL'

    blacklist = util.getBlacklist(ed)
    morphDb = M.MorphDb(selected)

    state = {
        'mp': M.mecab(),
        'db': morphDb,
        'tags': unicode(tags),
        'bs': blacklist,
        'ed': ed,
    }
    return state
def run(duelingSubsPath, outputSubsPath, whitelist, blacklist, matureFmt, knownFmt, unknownFmt):
    """Rewrite a dueling-subtitles file, formatting each pair by db membership.

    Reads `duelingSubsPath` as UTF-8, treats every line starting with
    u'Dialogue' as part of consecutive (jpn, eng) pairs, and writes the
    original header followed by one formatted line per pair to
    `outputSubsPath`.

    For each pair the morphemes of the jpn text are extracted with
    `whitelist` / `blacklist` and checked against the two dbs loaded from
    `knownDbPath` and `matureDbPath` (names defined outside this function).
    The format string is chosen as follows and applied with `%` to a dict
    with keys 'jpn', 'eng', 'N_k', 'N_m', 'unknowns', 'unmatures':

      * `matureFmt`  when N_m == 0
      * `knownFmt`   when N_k == 0
      * `unknownFmt` otherwise

    Raises AssertionError when the number of dialogue lines is odd.
    """
    # Load files
    kdb = M.MorphDb(knownDbPath)
    mdb = M.MorphDb(matureDbPath)
    # `with` guarantees the handle is closed even if decoding fails
    with codecs.open(duelingSubsPath, 'r', 'utf-8') as subFile:
        subFileLines = subFile.readlines()

    # Start Mecab
    # NOTE(review): this handle is never shut down in this function; confirm
    # whether that is intentional for this entry point.
    mp = M.mecab()

    # Get dueling subs
    dialogueLines = [line for line in subFileLines if line.startswith(u'Dialogue')]
    if dialogueLines:
        header = subFileLines[:subFileLines.index(dialogueLines[0])]
    else:
        # No dialogue at all: everything is header, nothing to format
        header = subFileLines
    assert len(
        dialogueLines) % 2 == 0, 'Should be an even number of dialogue lines'

    lines = []
    for i in xrange(0, len(dialogueLines), 2):
        jpnLine, engLine = dialogueLines[i:i + 2]
        # The prefix is taken from the raw jpn line, not from its extracted text
        jpn, eng, prefix = getText(jpnLine), getText(engLine), getPreText(jpnLine)

        # get unknowns
        ms = M.getMorphemes(mp, jpn, ws=whitelist, bs=blacklist)
        unknowns, N_k = getNotInDb(ms, kdb.db)
        unmatures, N_m = getNotInDb(ms, mdb.db)
        d = {
            'jpn': jpn,
            'eng': eng,
            'N_k': N_k,
            'N_m': N_m,
            'unknowns': unknowns,
            'unmatures': unmatures
        }

        if N_m == 0:
            lines.append(prefix + matureFmt % d)
        elif N_k == 0:
            lines.append(prefix + knownFmt % d)
        else:
            lines.append(prefix + unknownFmt % d)

    # Header lines keep their own newlines; formatted lines are newline-joined
    with codecs.open(outputSubsPath, 'w', 'utf-8') as outFile:
        outFile.write(u''.join(header))
        outFile.write(u'\n'.join(lines))
def pre( ed ):
    """Build the state dict holding a rank db derived from the known db.

    Returns the string 'BAIL' when util.requireKnownDb() is falsy.
    Otherwise loads the db at util.knownDbPath, passes it to R.mkRankDb,
    and returns a dict with keys 'rdb' (that result) and 'mp' (a mecab
    handle). `ed` is not used.
    """
    if util.requireKnownDb():
        rankDb = R.mkRankDb( M.loadDb( util.knownDbPath ) )
        return { 'rdb': rankDb, 'mp': M.mecab( None ) }
    return 'BAIL'
def pre( ed ):
    """Build the initial state dict with an empty text accumulator.

    Returns the string 'BAIL' when util.requireKnownDb() is falsy.
    Otherwise returns a dict with keys 'ed', 'txt' (an empty string),
    'bs' (the blacklist obtained for `ed`) and 'mp' (a mecab handle).
    """
    if util.requireKnownDb():
        blacklist = util.getBlacklist( ed )
        return {
            'ed': ed,
            'txt': '',
            'bs': blacklist,
            'mp': M.mecab(),
        }
    return 'BAIL'
def mkAll( self ): # IO ()
    """Bring all.db up to date with the current facts and save it.

    Loads the db from self.allPath on first use (or starts an empty one if
    that raises IOError) and caches it on self._allDb. Then, for every
    fact from self.getFacts() and every field name in
    self.cfg['morph fields']:

      * fields the fact does not have are skipped;
      * an existing location whose text is unchanged but whose maturities
        differ is replaced by a new location that keeps its morphemes;
      * an existing location whose text changed is replaced and its
        morphemes are re-extracted;
      * an unseen (fact id, field) pair is added if extraction yields any
        morphemes.

    Finally the db is cleared, rebuilt from the location map, saved to
    self.allPath, and timing information is stored in self.cfg.

    Returns self._allDb.
    """
    log( 'Getting initial all.db...' )
    if not hasattr( self, '_allDb' ):
        try:
            self._allDb = M.MorphDb( self.allPath )
            debug( ' * Updating existing all.db' )
        except IOError:
            self._allDb = M.MorphDb()
            debug( ' * Creating new all.db from scratch' )
    allDb = self._allDb
    log( '...done' )

    mp = M.mecab()
    try:
        # pre-cache lookups
        fieldNames = self.cfg['morph fields']
        whitelist, blacklist = parseWhitelist( self.cfg['whitelist'] ), parseWhitelist( self.cfg['blacklist'] )
        fid2cardsDb = self.fid2cardsDb()
        fidDb = allDb.fidDb()
        locDb = allDb.locDb()
        fs = self.getFacts()

        i, lfs = 0, len( fs )
        start = time.time()
        last = time.time()
        for f in fs:
            mats = [ c.interval for c in fid2cardsDb[ f.id ] ]
            for fieldName in fieldNames:
                try:
                    fieldValue = normalizeFieldValue( f[ fieldName ] )
                except KeyError: # if fact doesn't have the field just skip it
                    continue
                try: # existing location
                    loc = fidDb[ (f.id, fieldName) ]
                    # new loc only; no morpheme change
                    if loc.fieldValue == fieldValue and loc.maturities != mats:
                        debug(' .mats for %d[%s]' % ( f.id, fieldName ) )
                        newLoc = M.AnkiDeck( f.id, fieldName, fieldValue, self.deckPath, self.deckName, mats )
                        ms = locDb.pop( loc )
                        locDb[ newLoc ] = ms
                    # new loc and new morphemes
                    elif loc.fieldValue != fieldValue:
                        debug(' .morphs for %d[%s]' % ( f.id, fieldName ) )
                        newLoc = M.AnkiDeck( f.id, fieldName, fieldValue, self.deckPath, self.deckName, mats )
                        ms = M.getMorphemes( mp, fieldValue, ws=whitelist, bs=blacklist )
                        locDb.pop( loc )
                        locDb[ newLoc ] = ms
                except KeyError: # new location
                    loc = M.AnkiDeck( f.id, fieldName, fieldValue, self.deckPath, self.deckName, mats )
                    ms = M.getMorphemes( mp, fieldValue, ws=whitelist, bs=blacklist )
                    if ms:
                        debug(' .loc for %d[%s]' % ( f.id, fieldName ) )
                        locDb[ loc ] = ms
            # progress is counted per fact (reported against len( fs ))
            i += 1
            if i % 100 == 0:
                log(' %d / %d = %d%% in %f sec' % ( i, lfs, 100.*i/lfs, time.time()-last ) )
                last = time.time()
        log( 'Proccessed all facts in %f sec. Now saving...' % ( time.time()-start ) )

        allDb.clear()
        allDb.addFromLocDb( locDb )
        allDb.save( self.allPath )
        self.cfg['last db update'][ self.allPath ] = time.time()
        self.cfg['last all.db update took'] = time.time() - start
        log( '...done' )
    finally:
        # Release the mecab handle even when scanning or saving fails
        sigterm( mp )
    return self._allDb
def pre( ed ):
    """Ask for a field name and a save path, then build the state dict.

    Prompts for the field to extract from (default 'Expression'), then
    shows a save-file dialog starting at util.dbPath + 'mySelection.db'.
    Returns the string 'BAIL' if the prompt is cancelled or no save path
    is chosen. Otherwise returns a dict with keys 'ed', 'fieldName',
    'filePath' (the path converted with str()), 'db' (a new, empty
    MorphDb) and 'mp' (a mecab handle).
    """
    fieldName, confirmed = QInputDialog.getText(
        ed, 'Enter name of field to extract from', 'Field Name',
        QLineEdit.Normal, 'Expression' )
    if not confirmed:
        return 'BAIL'

    initialPath = util.dbPath + 'mySelection.db'
    target = QFileDialog.getSaveFileName( caption='Save morpheme db to?', directory=initialPath )
    if not target:
        return 'BAIL'

    state = {
        'ed': ed,
        'fieldName': fieldName,
        'filePath': str( target ),
        'db': M.MorphDb(),
        'mp': M.mecab(),
    }
    return state
def mkAll(self):  # IO ()
    """Bring all.db up to date with the current facts and save it.

    Loads the db from self.allPath on first use (or starts an empty one if
    that raises IOError) and caches it on self._allDb. Then, for every
    fact from self.getFacts() and every field name in
    self.cfg['morph fields']:

      * fields the fact does not have are skipped;
      * an existing location whose text is unchanged but whose maturities
        differ is replaced by a new location that keeps its morphemes;
      * an existing location whose text changed is replaced and its
        morphemes are re-extracted;
      * an unseen (fact id, field) pair is added if extraction yields any
        morphemes.

    Finally the db is cleared, rebuilt from the location map, saved to
    self.allPath, and timing information is stored in self.cfg.

    Returns self._allDb.
    """
    log('Getting initial all.db...')
    if not hasattr(self, '_allDb'):
        try:
            self._allDb = M.MorphDb(self.allPath)
            debug(' * Updating existing all.db')
        except IOError:
            self._allDb = M.MorphDb()
            debug(' * Creating new all.db from scratch')
    allDb = self._allDb
    log('...done')

    mp = M.mecab()
    try:
        # pre-cache lookups
        fieldNames = self.cfg['morph fields']
        whitelist, blacklist = self.cfg['whitelist'], self.cfg['blacklist']
        fid2cardsDb = self.fid2cardsDb()
        fidDb = allDb.fidDb()
        locDb = allDb.locDb()
        fs = self.getFacts()

        i, lfs = 0, len(fs)
        start = time.time()
        last = time.time()
        for f in fs:
            mats = [c.interval for c in fid2cardsDb[f.id]]
            for fieldName in fieldNames:
                try:
                    fieldValue = normalizeFieldValue(f[fieldName])
                except KeyError:  # if fact doesn't have the field just skip it
                    continue
                try:  # existing location
                    loc = fidDb[(f.id, fieldName)]
                    # new loc only; no morpheme change
                    if loc.fieldValue == fieldValue and loc.maturities != mats:
                        debug(' .mats for %d[%s]' % (f.id, fieldName))
                        newLoc = M.AnkiDeck(f.id, fieldName, fieldValue,
                                            self.deckPath, self.deckName, mats)
                        ms = locDb.pop(loc)
                        locDb[newLoc] = ms
                    # new loc and new morphemes
                    elif loc.fieldValue != fieldValue:
                        debug(' .morphs for %d[%s]' % (f.id, fieldName))
                        newLoc = M.AnkiDeck(f.id, fieldName, fieldValue,
                                            self.deckPath, self.deckName, mats)
                        ms = M.getMorphemes(mp, fieldValue, ws=whitelist, bs=blacklist)
                        locDb.pop(loc)
                        locDb[newLoc] = ms
                except KeyError:  # new location
                    loc = M.AnkiDeck(f.id, fieldName, fieldValue,
                                     self.deckPath, self.deckName, mats)
                    ms = M.getMorphemes(mp, fieldValue, ws=whitelist, bs=blacklist)
                    if ms:
                        debug(' .loc for %d[%s]' % (f.id, fieldName))
                        locDb[loc] = ms
            # progress is counted per fact (reported against len(fs))
            i += 1
            if i % 100 == 0:
                log(' %d / %d = %d%% in %f sec' %
                    (i, lfs, 100. * i / lfs, time.time() - last))
                last = time.time()
        log('Proccessed all facts in %f sec. Now saving...' % (time.time() - start))

        allDb.clear()
        allDb.addFromLocDb(locDb)
        allDb.save(self.allPath)
        self.cfg['last db update'][self.allPath] = time.time()
        self.cfg['last all.db update took'] = time.time() - start
        log('...done')
    finally:
        # Release the mecab handle even when scanning or saving fails
        sigterm(mp)
    return self._allDb
def pre( ed ):
    """Build the state dict with the blacklist, the known db and mecab.

    Returns the string 'BAIL' when util.requireKnownDb() is falsy.
    Otherwise returns a dict with keys 'bs' (the blacklist obtained for
    `ed`), 'kdb' (the db loaded from util.knownDbPath) and 'mp' (a mecab
    handle).
    """
    if util.requireKnownDb():
        blacklist = util.getBlacklist( ed )
        knownDb = m.loadDb( util.knownDbPath )
        return {
            'bs': blacklist,
            'kdb': knownDb,
            'mp': m.mecab( None ),
        }
    return 'BAIL'