def runMatrixJobs(outFname, datasets, wordListFname, posPmidFname, negPmidFname, \
        skipMap, outFormat, onlyTest, docIdFname, posPmids=None, negPmids=None, runner=None):
    """ run jobs to convert the articles to a bag-of-words matrix """
    assert (outFormat in ["svml", "arff", "pmidsvml"])
    if isinstance(datasets, basestring):
        datasets = [datasets]

    if runner is None:
        runner = pubGeneric.makeClusterRunner(__file__)

    logging.debug("pos and neg pmid fnames are: %s, %s" % (posPmidFname, negPmidFname))
    if posPmidFname is not None:
        posPmids = parsePmids(posPmidFname)
    if negPmidFname is not None:
        negPmids = parsePmids(negPmidFname)

    termList = parseTerms(wordListFname)

    paramDict = {"termList": termList, "posPmids": posPmids, \
        "negPmids": negPmids, "outFormat": outFormat}
    paramDict["docIdOutFname"] = docIdFname

    pubAlg.mapReduce(__file__ + ":MatrixMaker", datasets, paramDict, \
        outFname, skipMap=skipMap, runTest=True, runner=runner, onlyTest=onlyTest)
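# Hedged usage sketch, not part of the original module: one way runMatrixJobs
# might be called. The file names and the dataset name "medline" are hypothetical
# placeholders; the argument order follows the signature above, and the cluster
# runner is created internally because runner is left at its default of None.
#
# runMatrixJobs("articles.svml", "medline", "wordList.txt", "pos.pmids", "neg.pmids",
#     skipMap=False, outFormat="svml", onlyTest=False, docIdFname="docIds.txt")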
def getRunner(self, step):
    " return a runner object for the current dataset and pipelineStep"
    headNode = pubConf.stepHosts.get(step, None)
    logging.debug("Headnode for step %s is %s" % (step, headNode))
    return pubGeneric.makeClusterRunner("pubMap-" + self.dataset + "-" + step, headNode=headNode)
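# Hedged sketch, not from the source: pubConf.stepHosts is assumed to be a plain
# dict mapping a pipeline step name to the cluster head node that should run it,
# e.g. stepHosts = {"annot": "cluster-head-1", "tables": "cluster-head-2"} (the
# host names are hypothetical). getRunner() above then labels the run
# "pubMap-<dataset>-<step>" and falls back to headNode=None, i.e. the default
# head node, for any step that is not listed.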
def submitJobs(inSpec, filterSpec, outDir):
    inDirs = pubConf.resolveTextDirs(inSpec)
    runner = pubGeneric.makeClusterRunner(__file__, maxJob=pubConf.convertMaxJob, algName=inSpec)
    outFnames = []
    for inDir in inDirs:
        inFnames = glob.glob(join(inDir, "*.articles.gz"))
        for inFname in inFnames:
            outFname = join(outDir, basename(dirname(inFname)) + "-" + basename(inFname))
            outFnames.append(outFname)
            outFnames.append(outFname.replace('.articles.gz', '.files.gz'))
            #command = "%s %s filterJob {check in exists %s} %s %s" % \
            #    (sys.executable, __file__, inFname, pmidFname, outFname)
            runner.submitPythonFunc(__file__, "filterOneChunk", [inFname, filterSpec, outFname])
    runner.finish(wait=True)
    return outFnames
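# Hedged sketch of the job-submission pattern shared by the functions above, not a
# function from the source: create a cluster runner, submit one python-function job
# per input chunk, then block until every job has finished. "myChunkFunc" and the
# file handling are hypothetical placeholders; makeClusterRunner, submitPythonFunc
# and finish(wait=True) are the calls used by the original code.
def runOnChunks(inFnames, outDir):
    runner = pubGeneric.makeClusterRunner(__file__)
    outFnames = []
    for inFname in inFnames:
        outFname = join(outDir, basename(inFname))
        outFnames.append(outFname)
        # each job runs myChunkFunc(inFname, outFname) on a cluster node
        runner.submitPythonFunc(__file__, "myChunkFunc", [inFname, outFname])
    runner.finish(wait=True)
    return outFnames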