Exemplo n.º 1
0
def buildMscPureMatrix(lang):
    """Build, save and return the binary (identity) MSC similarity matrix.

    If ARTS_FILE does not exist, the MSC database is first created with
    createMscsDb(); otherwise the existing file is reused and a warning is
    logged. Articles are then loaded from ARTS_FILE and restricted to those
    whose `language` equals `lang` (the special value 'any' keeps them all).

    The categories returned by mscs.getFreqMSCs are numbered with
    enumerate(); that id -> category mapping is handed to saveMsc2Id and the
    resulting matrix is written as a dense matrix to the file
    "mscs_bin_<lang>.mm" (path resolved by common.matrixFile).

    Args:
        lang: language code used to filter articles, or 'any'.

    Returns:
        A float32 numpy array of shape (len(cats), len(cats)) holding 1.0 on
        the diagonal and 0.0 everywhere else.
    """
    if os.path.exists(ARTS_FILE):
        logging.warning(
            "SKIPPING creating MSC database from meta.xml files (using old %s); is this what you want?",
            ARTS_FILE)
    else:
        logging.info("creating MSC database from meta.xml files")
        # only run this when collection changes (test for file existence / delete file explicitly?)
        createMscsDb()

    arts = [
        art for art in docsim.getArts(
            ARTS_FILE, acceptNoBody=True, acceptNoMsc=False)
        if art.language == lang or lang == 'any'
    ]

    cats = dict(enumerate(mscs.getFreqMSCs(arts, minFreq=1, useMains=False)))
    # NOTE(review): rcats is never read in this function; the call is kept
    # only because reverseMap is opaque from here -- confirm it can be dropped.
    rcats = utils_dml.reverseMap(cats)
    saveMsc2Id(cats, lang)

    logging.info("building MSC binary matrix")
    # binary msc similarity (with fixed msc hierarchy = identity): the keys of
    # `cats` are exactly 0..len(cats)-1, so the matrix is the identity and
    # can be built directly instead of filling it cell by cell.
    resultBin = numpy.eye(len(cats), dtype=numpy.float32)
    matutils.saveMatrix(resultBin,
                        common.matrixFile("mscs_bin_%s.mm" % lang),
                        sparse=False)
    return resultBin
def getArts(fname, minFreq, language='eng'):
    """Load articles of one language that have exactly one normalised MSC.

    Processing steps, in order:
      1. open the article database at common.dataFile(fname) and load the
         ipyutils dictionaries with prefix 'gensim_' + language;
      2. keep records whose 'language' field equals `language`;
      3. replace each article's MSC list by the distinct values of
         mscs.niceMSC(msc, prefix=2)[0];
      4. keep articles with exactly one such MSC, a non-empty body and an
         id_int present in ipyutils.rdocids;
      5. run body and title through the BAD_MAP translation table;
      6. keep articles whose MSC is in the set returned by
         mscs.getFreqMSCs(arts, minFreq, useMains=True).

    Args:
        fname: database file name, resolved through common.dataFile.
        minFreq: frequency threshold passed to mscs.getFreqMSCs.
        language: language code to select; defaults to 'eng', the value that
            used to be hard-coded here.

    Returns:
        The list of surviving Article objects (modified in place).
    """
    import ipyutils  # kept as a function-local import, as in the original

    def cleanText(text):
        # Decode the utf8 byte string, apply the BAD_MAP translation table
        # and encode back to utf8 (assumes Python 2 style byte strings).
        return text.decode('utf8').translate(BAD_MAP).encode('utf8')

    db = ArticleDB.ArticleDB(common.dataFile(fname), mode='open')
    ipyutils.loadDicts(prefix='gensim_' + language)

    arts = [Article.Article(rec) for rec in db.db if rec['language'] == language]
    for art in arts:
        art.msc = list(set(mscs.niceMSC(msc, prefix=2)[0] for msc in art.msc))
    logging.info('loaded %i articles from %s', len(arts), fname)

    arts = [
        art for art in arts
        if len(art.msc) == 1 and art.body and art.id_int in ipyutils.rdocids
    ]
    logging.info('extracted %i articles with exactly one MSC and non-empty body', len(arts))

    # The accepted MSC set is computed before the texts are cleaned below.
    okmsc = mscs.getFreqMSCs(arts, minFreq, useMains=True)
    mscs.printStats(arts)

    for art in arts:
        art.body = cleanText(art.body)
        if art.title:
            art.title = cleanText(art.title)
        art.msc = [mscs.niceMSC(art.msc[0], prefix=2)[0]]
    arts = [art for art in arts if art.msc[0] in okmsc]

    # Only the first element of the returned pair is used here.
    allmsc, _ = mscs.getMSCCnts(arts)
    for msc, mscarts in allmsc.items():
        logging.info("class %s: %i articles", msc, len(mscarts))
    logging.info("======================")
    logging.info("sum: %i articles", sum(len(mscarts) for mscarts in allmsc.values()))

    logging.debug(
        'using %i articles from all %s msc classes that are covered by at least %i articles',
        len(arts), sorted(okmsc), minFreq)
    return arts
def buildMscPureMatrix(lang):
    """Build, save and return the binary (identity) MSC similarity matrix.

    When ARTS_FILE is missing, createMscsDb() is called to create the MSC
    database first; when it exists, it is reused and a warning is logged.
    Articles are read from ARTS_FILE and kept only if their `language`
    matches `lang` (passing "any" disables the language filter).

    The categories from mscs.getFreqMSCs are numbered via enumerate(); this
    id -> category dict is passed to saveMsc2Id, and the matrix is saved
    densely under the name "mscs_bin_<lang>.mm" (via common.matrixFile).

    Args:
        lang: language code used to filter articles, or "any".

    Returns:
        A float32 numpy array of shape (len(cats), len(cats)) with 1.0 on
        the diagonal and 0.0 elsewhere.
    """
    if os.path.exists(ARTS_FILE):
        logging.warning(
            "SKIPPING creating MSC database from meta.xml files (using old %s); is this what you want?",
            ARTS_FILE,
        )
    else:
        logging.info("creating MSC database from meta.xml files")
        # only run this when collection changes (test for file existence / delete file explicitly?)
        createMscsDb()

    arts = [
        art
        for art in docsim.getArts(ARTS_FILE, acceptNoBody=True, acceptNoMsc=False)
        if art.language == lang or lang == "any"
    ]

    cats = dict(enumerate(mscs.getFreqMSCs(arts, minFreq=1, useMains=False)))
    # NOTE(review): rcats is not used afterwards; the call is retained only
    # because reverseMap cannot be inspected from here -- confirm and remove.
    rcats = utils_dml.reverseMap(cats)
    saveMsc2Id(cats, lang)

    logging.info("building MSC binary matrix")
    # binary msc similarity (with fixed msc hierarchy = identity): ids run
    # from 0 to len(cats)-1, so the identity matrix is built in one call
    # rather than by a quadratic Python loop over all id pairs.
    resultBin = numpy.eye(len(cats), dtype=numpy.float32)
    matutils.saveMatrix(resultBin, common.matrixFile("mscs_bin_%s.mm" % lang), sparse=False)
    return resultBin
Exemplo n.º 4
0
def getArts(fname, minFreq, language='eng'):
    """Return articles of one language that have exactly one normalised MSC.

    The function:
      * opens the article database found at common.dataFile(fname) and loads
        the ipyutils dictionaries using the prefix 'gensim_' + language;
      * selects records whose 'language' field equals `language`;
      * rewrites each article's MSC list to the distinct values of
        mscs.niceMSC(msc, prefix=2)[0];
      * drops articles that do not have exactly one MSC, have an empty body,
        or whose id_int is not in ipyutils.rdocids;
      * passes body and title through the BAD_MAP translation table;
      * drops articles whose MSC is not in the set returned by
        mscs.getFreqMSCs(arts, minFreq, useMains=True).

    Args:
        fname: database file name, resolved through common.dataFile.
        minFreq: frequency threshold forwarded to mscs.getFreqMSCs.
        language: language code to select; the default 'eng' reproduces the
            previously hard-coded behaviour.

    Returns:
        The list of remaining Article objects (they are modified in place).
    """
    import ipyutils  # function-local import preserved from the original

    def cleanText(text):
        # utf8 bytes -> unicode, apply BAD_MAP, -> utf8 bytes again
        # (assumes Python 2 style byte strings).
        return text.decode('utf8').translate(BAD_MAP).encode('utf8')

    db = ArticleDB.ArticleDB(common.dataFile(fname), mode='open')
    ipyutils.loadDicts(prefix='gensim_' + language)

    arts = [
        Article.Article(rec) for rec in db.db if rec['language'] == language
    ]
    for art in arts:
        art.msc = list(set(mscs.niceMSC(msc, prefix=2)[0] for msc in art.msc))
    logging.info('loaded %i articles from %s', len(arts), fname)

    arts = [
        art for art in arts
        if len(art.msc) == 1 and art.body and art.id_int in ipyutils.rdocids
    ]
    logging.info(
        'extracted %i articles with exactly one MSC and non-empty body',
        len(arts))

    # The set of accepted MSCs is fixed here, before the text clean-up.
    okmsc = mscs.getFreqMSCs(arts, minFreq, useMains=True)
    mscs.printStats(arts)

    for art in arts:
        art.body = cleanText(art.body)
        if art.title:
            art.title = cleanText(art.title)
        art.msc = [mscs.niceMSC(art.msc[0], prefix=2)[0]]
    arts = [art for art in arts if art.msc[0] in okmsc]

    # The second element of the returned pair is not needed in this function.
    allmsc, _ = mscs.getMSCCnts(arts)
    for msc, mscarts in allmsc.items():
        logging.info("class %s: %i articles", msc, len(mscarts))
    logging.info("======================")
    logging.info("sum: %i articles",
                 sum(len(mscarts) for mscarts in allmsc.values()))

    logging.debug(
        'using %i articles from all %s msc classes that are covered by at least %i articles',
        len(arts), sorted(okmsc), minFreq)
    return arts