def runMatrixJobs(outFname, datasets, wordListFname, posPmidFname, negPmidFname, \
        skipMap, outFormat, onlyTest, docIdFname, posPmids=None, negPmids=None, runner=None):
    """ run jobs to convert the articles to a bag-of-words matrix

    outFname      - handed to pubAlg.mapReduce after the parameter dict
                    (presumably the output file name)
    datasets      - a single string or a list; a string is wrapped into a list
    wordListFname - parsed with parseTerms() to obtain the term list
    posPmidFname, negPmidFname - when not None, parsed with parsePmids();
                    the result replaces the posPmids / negPmids argument
    skipMap, onlyTest - passed through unchanged to pubAlg.mapReduce
    outFormat     - must be one of "svml", "arff", "pmidsvml"
    docIdFname    - given to the MatrixMaker jobs under the key "docIdOutFname"
    posPmids, negPmids - used as-is when the matching filename is None
    runner        - job runner; if None, one is created with
                    pubGeneric.makeClusterRunner(__file__)
    """
    # NOTE(review): this function is defined a second time directly below in
    # this file; the later definition is the one that is bound at import time.
    # One of the two should be removed.
    assert outFormat in ("svml", "arff", "pmidsvml"), \
        "unsupported outFormat: %r" % (outFormat,)

    # basestring only exists in Python 2: accept a lone dataset string
    if isinstance(datasets, basestring):
        datasets = [datasets]

    # identity test, so a runner with a custom __eq__ is never mistaken for None
    if runner is None:
        runner = pubGeneric.makeClusterRunner(__file__)

    logging.debug("pos and neg pmid fnames are: %s, %s", posPmidFname, negPmidFname)

    # filenames take precedence over PMIDs passed in directly
    if posPmidFname is not None:
        posPmids = parsePmids(posPmidFname)
    if negPmidFname is not None:
        negPmids = parsePmids(negPmidFname)

    paramDict = {
        "termList": parseTerms(wordListFname),
        "posPmids": posPmids,
        "negPmids": negPmids,
        "outFormat": outFormat,
        "docIdOutFname": docIdFname,
    }

    pubAlg.mapReduce(__file__ + ":MatrixMaker", datasets, paramDict, outFname,
                     skipMap=skipMap, runTest=True, runner=runner, onlyTest=onlyTest)
def runMatrixJobs(outFname, datasets, wordListFname, posPmidFname, negPmidFname, \
        skipMap, outFormat, onlyTest, docIdFname, posPmids=None, negPmids=None, runner=None):
    """ run jobs to convert the articles to a bag-of-words matrix

    outFname      - handed to pubAlg.mapReduce after the parameter dict
                    (presumably the output file name)
    datasets      - a single string or a list; a string is wrapped into a list
    wordListFname - parsed with parseTerms() to obtain the term list
    posPmidFname, negPmidFname - when not None, parsed with parsePmids();
                    the result replaces the posPmids / negPmids argument
    skipMap, onlyTest - passed through unchanged to pubAlg.mapReduce
    outFormat     - must be one of "svml", "arff", "pmidsvml"
    docIdFname    - given to the MatrixMaker jobs under the key "docIdOutFname"
    posPmids, negPmids - used as-is when the matching filename is None
    runner        - job runner; if None, one is created with
                    pubGeneric.makeClusterRunner(__file__)
    """
    # NOTE(review): this function is also defined immediately above in this
    # file; being the later definition, this is the one in effect. One of the
    # two should be removed.
    assert outFormat in ("svml", "arff", "pmidsvml"), \
        "unsupported outFormat: %r" % (outFormat,)

    # basestring only exists in Python 2: accept a lone dataset string
    if isinstance(datasets, basestring):
        datasets = [datasets]

    # identity test, so a runner with a custom __eq__ is never mistaken for None
    if runner is None:
        runner = pubGeneric.makeClusterRunner(__file__)

    logging.debug("pos and neg pmid fnames are: %s, %s", posPmidFname, negPmidFname)

    # filenames take precedence over PMIDs passed in directly
    if posPmidFname is not None:
        posPmids = parsePmids(posPmidFname)
    if negPmidFname is not None:
        negPmids = parsePmids(negPmidFname)

    paramDict = {
        "termList": parseTerms(wordListFname),
        "posPmids": posPmids,
        "negPmids": negPmids,
        "outFormat": outFormat,
        "docIdOutFname": docIdFname,
    }

    pubAlg.mapReduce(__file__ + ":MatrixMaker", datasets, paramDict, outFname,
                     skipMap=skipMap, runTest=True, runner=runner, onlyTest=onlyTest)
def buildWordList(runner, datasets, skipMap, outFname):
    """ run the WordCounter map/reduce jobs of this module over datasets

    Calls pubAlg.mapReduce with an empty parameter dict, runTest=False and
    cleanUp=True; runner, datasets, skipMap and outFname are passed through
    unchanged (outFname presumably being the output file).
    """
    algName = __file__ + ":WordCounter"
    noParams = {}
    pubAlg.mapReduce(
        algName,
        datasets,
        noParams,
        outFname,
        skipMap=skipMap,
        runTest=False,
        cleanUp=True,
        runner=runner
    )