def buildMscPureMatrix(lang):
    """Build, save and return the binary ("pure") MSC similarity matrix.

    The matrix is square with one row/column per MSC category found in the
    articles of the requested language.  Every category is treated as similar
    only to itself, so the result is the identity matrix.

    Parameters:
        lang: language code compared against ``art.language``; the special
            value ``'any'`` accepts articles of every language.

    Returns:
        A ``numpy.float32`` array of shape ``(len(cats), len(cats))`` with
        ones on the diagonal and zeros elsewhere.  The same matrix is also
        written via ``matutils.saveMatrix`` to
        ``common.matrixFile("mscs_bin_<lang>.mm")`` in dense form.
    """
    if os.path.exists(ARTS_FILE):
        logging.warning(
            "SKIPPING creating MSC database from meta.xml files (using old %s); is this what you want?",
            ARTS_FILE)
    else:
        logging.info("creating MSC database from meta.xml files")
        # Only needs to run when the collection changes; an existing ARTS_FILE
        # is reused as-is (delete the file explicitly to force a rebuild).
        createMscsDb()

    arts = [
        art
        for art in docsim.getArts(ARTS_FILE, acceptNoBody=True, acceptNoMsc=False)
        if art.language == lang or lang == 'any'
    ]

    # Category ids are consecutive integers 0..n-1 because they come from
    # enumerate(); that is what allows them to index the matrix directly.
    cats = dict(enumerate(mscs.getFreqMSCs(arts, minFreq=1, useMains=False)))
    saveMsc2Id(cats, lang)  # presumably persists the id -> MSC mapping; see saveMsc2Id

    logging.info("building MSC binary matrix")
    # Binary MSC similarity with a fixed MSC hierarchy is the identity matrix:
    # 1.0 where row id == column id, 0.0 everywhere else.
    resultBin = numpy.identity(len(cats), dtype=numpy.float32)

    matutils.saveMatrix(resultBin, common.matrixFile("mscs_bin_%s.mm" % lang), sparse=False)
    return resultBin
def getArts(fname, minFreq):
    """Load English articles with exactly one MSC class and a non-empty body.

    Steps, in order:
      1. open the article database stored under ``common.dataFile(fname)``;
      2. keep records whose ``language`` field is ``'eng'``;
      3. replace each article's MSC list by the distinct values of
         ``mscs.niceMSC(msc, prefix=2)[0]``;
      4. keep articles that have exactly one such MSC, a non-empty body and an
         ``id_int`` present in ``ipyutils.rdocids``;
      5. clean body and title through ``BAD_MAP`` (utf8 decode, translate,
         utf8 encode);
      6. keep articles whose MSC is in the set returned by
         ``mscs.getFreqMSCs(..., minFreq, useMains=True)``.

    Parameters:
        fname: database file name, resolved through ``common.dataFile``.
        minFreq: frequency threshold forwarded to ``mscs.getFreqMSCs``.

    Returns:
        The list of surviving ``Article`` objects (modified in place).

    Note: relies on Python 2 idioms (``iteritems`` and byte-string ``decode``).
    """
    database = ArticleDB.ArticleDB(common.dataFile(fname), mode='open')
    import ipyutils
    language = 'eng'
    ipyutils.loadDicts(prefix='gensim_' + language)

    articles = []
    for record in database.db:
        if record['language'] == language:
            articles.append(Article.Article(record))

    # Collapse every article's MSC codes to their distinct normalised form.
    for article in articles:
        distinct = set()
        for code in article.msc:
            distinct.add(mscs.niceMSC(code, prefix=2)[0])
        article.msc = list(distinct)
    logging.info('loaded %i articles from %s' % (len(articles), fname))

    selected = []
    for article in articles:
        if len(article.msc) != 1:
            continue
        if not article.body:
            continue
        if article.id_int not in ipyutils.rdocids:
            continue
        selected.append(article)
    articles = selected
    logging.info('extracted %i articles with exactly one MSC and non-empty body' % (len(articles)))

    # The accepted-class set is computed before the text clean-up below.
    okmsc = mscs.getFreqMSCs(articles, minFreq, useMains=True)
    mscs.printStats(articles)

    def _scrub(raw):
        # utf8 bytes -> unicode, map characters through BAD_MAP, back to utf8 bytes
        return raw.decode('utf8').translate(BAD_MAP).encode('utf8')

    for article in articles:
        article.body = _scrub(article.body)
        if article.title:
            article.title = _scrub(article.title)
        article.msc = [mscs.niceMSC(article.msc[0], prefix=2)[0]]

    articles = [article for article in articles if article.msc[0] in okmsc]

    allmsc, _ = mscs.getMSCCnts(articles)
    total = 0
    for msc, members in allmsc.iteritems():
        logging.info("class %s: %i articles" % (msc, len(members)))
        total += len(members)
    logging.info("======================")
    logging.info("sum: %i articles" % total)
    logging.debug('using %i articles from all %s msc classes that are covered by at least %i articles' %
                  (len(articles), sorted(list(okmsc)), minFreq))
    return articles
def buildMscPureMatrix(lang):
    """Build, save and return the binary ("pure") MSC similarity matrix.

    The matrix is square with one row/column per MSC category found in the
    articles of the requested language.  Every category is treated as similar
    only to itself, so the result is the identity matrix.

    Parameters:
        lang: language code compared against ``art.language``; the special
            value ``"any"`` accepts articles of every language.

    Returns:
        A ``numpy.float32`` array of shape ``(len(cats), len(cats))`` with
        ones on the diagonal and zeros elsewhere.  The same matrix is also
        written via ``matutils.saveMatrix`` to
        ``common.matrixFile("mscs_bin_<lang>.mm")`` in dense form.
    """
    if os.path.exists(ARTS_FILE):
        logging.warning(
            "SKIPPING creating MSC database from meta.xml files (using old %s); is this what you want?",
            ARTS_FILE)
    else:
        logging.info("creating MSC database from meta.xml files")
        # Only needs to run when the collection changes; an existing ARTS_FILE
        # is reused as-is (delete the file explicitly to force a rebuild).
        createMscsDb()

    arts = [
        art
        for art in docsim.getArts(ARTS_FILE, acceptNoBody=True, acceptNoMsc=False)
        if art.language == lang or lang == "any"
    ]

    # Category ids are consecutive integers 0..n-1 because they come from
    # enumerate(); that is what allows them to index the matrix directly.
    cats = dict(enumerate(mscs.getFreqMSCs(arts, minFreq=1, useMains=False)))
    saveMsc2Id(cats, lang)  # presumably persists the id -> MSC mapping; see saveMsc2Id

    logging.info("building MSC binary matrix")
    # Binary MSC similarity with a fixed MSC hierarchy is the identity matrix:
    # 1.0 where row id == column id, 0.0 everywhere else.
    resultBin = numpy.identity(len(cats), dtype=numpy.float32)

    matutils.saveMatrix(resultBin, common.matrixFile("mscs_bin_%s.mm" % lang), sparse=False)
    return resultBin
def getArts(fname, minFreq):
    """Load English articles with exactly one MSC class and a non-empty body.

    Steps, in order:
      1. open the article database stored under ``common.dataFile(fname)``;
      2. keep records whose ``language`` field is ``'eng'``;
      3. replace each article's MSC list by the distinct values of
         ``mscs.niceMSC(msc, prefix=2)[0]``;
      4. keep articles that have exactly one such MSC, a non-empty body and an
         ``id_int`` present in ``ipyutils.rdocids``;
      5. clean body and title through ``BAD_MAP`` (utf8 decode, translate,
         utf8 encode);
      6. keep articles whose MSC is in the set returned by
         ``mscs.getFreqMSCs(..., minFreq, useMains=True)``.

    Parameters:
        fname: database file name, resolved through ``common.dataFile``.
        minFreq: frequency threshold forwarded to ``mscs.getFreqMSCs``.

    Returns:
        The list of surviving ``Article`` objects (modified in place).

    Note: relies on Python 2 idioms (``iteritems`` and byte-string ``decode``).
    """
    database = ArticleDB.ArticleDB(common.dataFile(fname), mode='open')
    import ipyutils
    language = 'eng'
    ipyutils.loadDicts(prefix='gensim_' + language)

    articles = []
    for record in database.db:
        if record['language'] == language:
            articles.append(Article.Article(record))

    # Collapse every article's MSC codes to their distinct normalised form.
    for article in articles:
        distinct = set()
        for code in article.msc:
            distinct.add(mscs.niceMSC(code, prefix=2)[0])
        article.msc = list(distinct)
    logging.info('loaded %i articles from %s' % (len(articles), fname))

    selected = []
    for article in articles:
        if len(article.msc) != 1:
            continue
        if not article.body:
            continue
        if article.id_int not in ipyutils.rdocids:
            continue
        selected.append(article)
    articles = selected
    logging.info('extracted %i articles with exactly one MSC and non-empty body' % (len(articles)))

    # The accepted-class set is computed before the text clean-up below.
    okmsc = mscs.getFreqMSCs(articles, minFreq, useMains=True)
    mscs.printStats(articles)

    def _scrub(raw):
        # utf8 bytes -> unicode, map characters through BAD_MAP, back to utf8 bytes
        return raw.decode('utf8').translate(BAD_MAP).encode('utf8')

    for article in articles:
        article.body = _scrub(article.body)
        if article.title:
            article.title = _scrub(article.title)
        article.msc = [mscs.niceMSC(article.msc[0], prefix=2)[0]]

    articles = [article for article in articles if article.msc[0] in okmsc]

    allmsc, _ = mscs.getMSCCnts(articles)
    total = 0
    for msc, members in allmsc.iteritems():
        logging.info("class %s: %i articles" % (msc, len(members)))
        total += len(members)
    logging.info("======================")
    logging.info("sum: %i articles" % total)
    logging.debug('using %i articles from all %s msc classes that are covered by at least %i articles' %
                  (len(articles), sorted(list(okmsc)), minFreq))
    return articles