예제 #1
0
def runMatrixJobs(outFname, datasets, wordListFname, posPmidFname, negPmidFname,
        skipMap, outFormat, onlyTest, docIdFname, posPmids=None, negPmids=None, runner=None):
    """ run jobs to convert the articles to a bag-of-words matrix

    outFname      -- output location handed to pubAlg.mapReduce
    datasets      -- a single dataset (string) or a list of them; a single
                     string is wrapped into a one-element list
    wordListFname -- file parsed with parseTerms() to obtain the term list
    posPmidFname  -- if not None, parsed with parsePmids(); the result
                     replaces whatever was passed as posPmids
    negPmidFname  -- if not None, parsed with parsePmids(); the result
                     replaces whatever was passed as negPmids
    skipMap       -- passed through unchanged to pubAlg.mapReduce
    outFormat     -- must be one of "svml", "arff" or "pmidsvml"
    onlyTest      -- passed through unchanged to pubAlg.mapReduce
    docIdFname    -- handed to the job as parameter "docIdOutFname"
    posPmids      -- only used when posPmidFname is None
    negPmids      -- only used when negPmidFname is None
    runner        -- runner handed to pubAlg.mapReduce; when None, one is
                     created with pubGeneric.makeClusterRunner(__file__)
    """

    # kept as an assert so that callers still see an AssertionError
    assert (outFormat in ["svml", "arff", "pmidsvml"])

    if isinstance(datasets, basestring):
        datasets = [datasets]

    # identity test: "== None" could be hijacked by a custom __eq__
    if runner is None:
        runner = pubGeneric.makeClusterRunner(__file__)

    # arguments are passed lazily, the message is only built if DEBUG is on
    logging.debug("pos and neg pmid fnames are: %s, %s",
                  posPmidFname, negPmidFname)

    # a file name, when given, takes precedence over the in-memory argument
    if posPmidFname is not None:
        posPmids = parsePmids(posPmidFname)
    if negPmidFname is not None:
        negPmids = parsePmids(negPmidFname)

    termList = parseTerms(wordListFname)

    paramDict = {"termList" : termList, "posPmids"  : posPmids,
                 "negPmids" : negPmids, "outFormat" : outFormat }
    paramDict["docIdOutFname"] = docIdFname

    # the algorithm is addressed as "<this file>:MatrixMaker"
    pubAlg.mapReduce(__file__+":MatrixMaker", datasets, paramDict,
        outFname, skipMap=skipMap, runTest=True, runner=runner, onlyTest=onlyTest)
예제 #2
0
def runMatrixJobs(outFname, datasets, wordListFname, posPmidFname, negPmidFname,
        skipMap, outFormat, onlyTest, docIdFname, posPmids=None, negPmids=None, runner=None):
    """ run jobs to convert the articles to a bag-of-words matrix

    outFname      -- output location handed to pubAlg.mapReduce
    datasets      -- a single dataset (string) or a list of them; a single
                     string is wrapped into a one-element list
    wordListFname -- file parsed with parseTerms() to obtain the term list
    posPmidFname  -- if not None, parsed with parsePmids(); the result
                     replaces whatever was passed as posPmids
    negPmidFname  -- if not None, parsed with parsePmids(); the result
                     replaces whatever was passed as negPmids
    skipMap       -- passed through unchanged to pubAlg.mapReduce
    outFormat     -- must be one of "svml", "arff" or "pmidsvml"
    onlyTest      -- passed through unchanged to pubAlg.mapReduce
    docIdFname    -- handed to the job as parameter "docIdOutFname"
    posPmids      -- only used when posPmidFname is None
    negPmids      -- only used when negPmidFname is None
    runner        -- runner handed to pubAlg.mapReduce; when None, one is
                     created with pubGeneric.makeClusterRunner(__file__)
    """

    # kept as an assert so that callers still see an AssertionError
    assert (outFormat in ["svml", "arff", "pmidsvml"])

    if isinstance(datasets, basestring):
        datasets = [datasets]

    # identity test: "== None" could be hijacked by a custom __eq__
    if runner is None:
        runner = pubGeneric.makeClusterRunner(__file__)

    # arguments are passed lazily, the message is only built if DEBUG is on
    logging.debug("pos and neg pmid fnames are: %s, %s", posPmidFname, negPmidFname)

    # a file name, when given, takes precedence over the in-memory argument
    if posPmidFname is not None:
        posPmids = parsePmids(posPmidFname)
    if negPmidFname is not None:
        negPmids = parsePmids(negPmidFname)

    termList = parseTerms(wordListFname)

    paramDict = {"termList" : termList, "posPmids"  : posPmids,
                 "negPmids" : negPmids, "outFormat" : outFormat }
    paramDict["docIdOutFname"] = docIdFname

    # the algorithm is addressed as "<this file>:MatrixMaker"
    pubAlg.mapReduce(__file__+":MatrixMaker", datasets, paramDict,
        outFname, skipMap=skipMap, runTest=True, runner=runner, onlyTest=onlyTest)
예제 #3
0
def buildWordList(runner, datasets, skipMap, outFname):
    """ start the WordCounter map/reduce job on datasets, output goes to outFname

    The job gets an empty parameter dictionary and is run with
    runTest=False and cleanUp=True; runner and skipMap are passed through.
    """
    # the algorithm is addressed as "<this file>:WordCounter"
    algName = __file__ + ":WordCounter"
    noParams = {}
    pubAlg.mapReduce(
        algName, datasets, noParams, outFname,
        skipMap=skipMap,
        runTest=False,
        cleanUp=True,
        runner=runner)
예제 #4
0
def buildWordList(runner, datasets, skipMap, outFname):
    """ run the WordCounter algorithm over datasets via pubAlg.mapReduce

    Results are written to outFname. No algorithm parameters are supplied
    (empty dict); the call uses runTest=False and cleanUp=True.
    """
    jobOptions = dict(skipMap=skipMap, runTest=False, cleanUp=True, runner=runner)
    # "<this file>:WordCounter" selects the algorithm to run
    pubAlg.mapReduce(__file__ + ":WordCounter", datasets, {}, outFname, **jobOptions)