def createChunksSubmitJobs(inDir, outDir, minId, runner, chunkSize):
    """ submit jobs to convert zip and disk files from inDir to outDir
    split files into chunks and submit chunks to cluster system
    write first to temporary dir, and copy over at end of all jobs
    This is based on pubConvElsevier.py
    """
    maxCommon.mustExistDir(outDir)

    updateId, minId, alreadyDoneFiles = pubStore.parseUpdatesTab(outDir, minId)
    assert(chunkSize != None)

    finalOutDir = outDir
    outDir = tempfile.mktemp(dir=outDir, prefix="springerUpdate.tmp.")
    os.mkdir(outDir)

    # getting filenames from the disk
    diskDir = join(inDir, "disk")
    inDiskFiles = []  # stays empty if there is no springer disk dump to parse
    if int(updateId) == 0 and isdir(diskDir):
        inDiskFiles = parseDiskFnames(diskDir)
    else:
        logging.info("Not first update or no directory %s, not parsing files from springer disk" % diskDir)

    # getting filenames from the updates
    zipDir = join(inDir, "updates")
    inZipFiles = os.listdir(zipDir)
    inZipFiles = [x for x in inZipFiles if x.endswith(".zip")]
    logging.info("Found %d update zip files" % len(inZipFiles))

    # keep order of input files for first run
    if len(alreadyDoneFiles) == 0:
        processFiles = inDiskFiles + inZipFiles
    else:
        processFiles = set(inZipFiles).difference(alreadyDoneFiles)

    if len(processFiles) == 0:
        logging.info("All updates done, not converting anything")
        return None
    else:
        logging.info("Total number of files to convert: %d" % (len(processFiles)))

    indexFilename = join(outDir, "%d_index.tab" % updateId)
    maxArticleId = createIndexFile(zipDir, processFiles, indexFilename, updateId, minId, chunkSize)

    indexSplitDir = join(outDir, "indexFiles")
    pubStore.splitTabFileOnChunkId(indexFilename, indexSplitDir)

    idFname = concatDois(finalOutDir, outDir, "doneArticles.tab")

    submitJobs(runner, zipDir, indexSplitDir, idFname, outDir)

    pubGeneric.concatDelIdFiles(outDir, finalOutDir, "%d_ids.tab" % updateId)
    pubGeneric.concatDelLogs(outDir, finalOutDir, "%d.log" % updateId)

    # cleanup, move over, remove whole temp dir
    if isdir(indexSplitDir):  # necessary? how could it not be there?
        logging.info("Deleting directory %s" % indexSplitDir)
        shutil.rmtree(indexSplitDir)  # got sometimes exception here...
    pubStore.moveFiles(outDir, finalOutDir)
    shutil.rmtree(outDir)

    pubStore.appendToUpdatesTxt(finalOutDir, updateId, maxArticleId, processFiles)
def runChunkMatrix(outDir, datasets):
    batchDir = join(pubConf.clusterBatchDir, "pubExpMatrix")
    maxCommon.mustExistDir(batchDir, makeDir=True)
    cluster = maxRun.Runner(batchDir=batchDir)
    for chunkName in pubStore.iterChunks(datasets):
        outFname = join(outDir, basename(chunkName) + ".tab.gz")
        params = ["{check in exists %s}" % chunkName, "{check out exists %s}" % outFname]
        cluster.submitPythonFunc(__file__, "chunkMatrix", params)
    cluster.finish()
def createChunksSubmitJobs(inDir, outDir, minId, runner, chunkSize):
    tmpDir = pubGeneric.makeTempDir("bingData", tmpDir=outDir)
    #maxCommon.delOnExit(tmpDir)
    maxCommon.mustExistDir(outDir)
    updateId, minId, alreadyDoneFiles = pubStore.parseUpdatesTab(outDir, minId)

    # get all .index.gz files, remove the already done files
    inFnames = glob.glob(join(inDir, "*.index.gz"))
    inBaseNames = set([basename(x) for x in inFnames])
    todoBasenames = inBaseNames - set(alreadyDoneFiles)
    todoFnames = [join(inDir, x) for x in todoBasenames]
    if len(todoFnames) == 0:
        logging.info("All input files already converted")
        return

    indexFilename = join(outDir, "%d_index.tab" % updateId)
    indexFile = open(indexFilename, "w")
    headers = ["articleId", "tsvFile", "url", "offset"]
    indexFile.write("\t".join(headers))
    indexFile.write("\n")

    # read them and create a big index file:
    # with tsvname, url, offset
    numId = minId
    doneUrls = set()
    for fname in todoFnames:
        baseName = basename(fname)
        for line in gzip.open(fname):
            url, offset = line.rstrip("\n").split("\t")
            assert(offset.isdigit())
            if "\t" in url or "\n" in url:
                logging.info("tab or NL in url %s, skipping" % url)
                continue
            if url in doneUrls:
                logging.info("Already did %s" % url)
                continue
            baseName = baseName.replace(".index.gz", ".gz")
            row = [str(numId), baseName, url, offset]
            indexFile.write("\t".join(row))
            indexFile.write("\n")
            doneUrls.add(url)  # remember the url so duplicates are skipped
            numId += 1
    indexFile.close()

    # split the index file into chunks, one per job
    chunkIds = pubStore.splitTabFileOnChunkId(indexFilename, tmpDir, chunkSize=chunkSize)
    idFname = pubGeneric.concatIdentifiers(outDir, tmpDir, "doneArticles.tab")

    # submit one conversion job per chunk
    submitConvertJobs(runner, inDir, updateId, chunkIds, tmpDir, idFname, tmpDir)

    pubGeneric.concatDelIdFiles(tmpDir, outDir, "%d_ids.tab" % updateId)
    pubGeneric.concatDelLogs(tmpDir, outDir, "%d.log" % updateId)
    pubStore.moveFiles(tmpDir, outDir)
    shutil.rmtree(tmpDir)
    pubStore.appendToUpdatesTxt(outDir, updateId, numId, todoBasenames)
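# A minimal reader sketch (not part of the original code) for the index files
# written above, assuming the tab-separated layout with the "articleId",
# "tsvFile", "url", "offset" header defined in createChunksSubmitJobs; the
# helper name readIndexRows is illustrative.
import csv

def readIndexRows(indexFilename):
    " yield one dict per data row of a <updateId>_index.tab file "
    with open(indexFilename) as fh:
        for row in csv.DictReader(fh, delimiter="\t"):
            # offsets are written as strings; most callers will want integers
            row["offset"] = int(row["offset"])
            yield row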
def __init__(self, dataset):
    self.dataset = dataset

    self.textDir = pubConf.resolveTextDir(dataset)
    if self.textDir == None:
        raise Exception("dataset %s can not be resolved to a directory" % dataset)

    self.pubMapBaseDir = pubConf.pubMapBaseDir
    maxCommon.mustExistDir(pubConf.pubMapBaseDir, makeDir=True)

    self._defineBatchDirectories()
def createChunksSubmitJobs(inDir, outDir, minId, runner, chunkSize):
    """ convert Consyn ZIP files from inDir to outDir
    split files into chunks and submit chunks to cluster system
    write first to temporary dir, and copy over at end of all jobs
    """
    maxCommon.mustExistDir(outDir)
    updateId, minId, alreadyDoneFiles = pubStore.parseUpdatesTab(outDir, minId)
    if chunkSize == None:
        chunkSize = pubStore.guessChunkSize(outDir)
    assert(chunkSize != None)

    # build into temporary dir, fail if it exists
    # it should not exist, otherwise something is wrong
    finalOutDir = outDir
    #outDir = tempfile.mktemp(dir=outDir, prefix="elsevierUpdate%s.tmp." % str(updateId))
    buildDir = join(outDir, "build")
    os.mkdir(buildDir)

    inFiles = os.listdir(inDir)
    inFiles = [x for x in inFiles if x.endswith(".ZIP")]
    # keep order of input files for first run
    if len(alreadyDoneFiles) != 0:
        processFiles = set(inFiles).difference(alreadyDoneFiles)
    else:
        processFiles = inFiles

    if len(processFiles) == 0:
        logging.info("All updates done, not converting anything")
        os.rmdir(buildDir)
        return None

    indexFilename = join(buildDir, "%d_index.tab" % updateId)
    maxArticleId = createIndexFile(inDir, processFiles, indexFilename, updateId, minId, chunkSize)

    indexSplitDir = indexFilename + ".tmp.split"
    chunkIds = pubStore.splitTabFileOnChunkId(indexFilename, indexSplitDir)

    idFname = pubGeneric.concatIdentifiers(finalOutDir, indexSplitDir, "doneArticles.tab")

    submitJobs(runner, inDir, chunkIds, indexSplitDir, idFname, buildDir)

    pubGeneric.concatDelIdFiles(buildDir, finalOutDir, "%d_ids.tab" % updateId)
    pubGeneric.concatDelLogs(buildDir, finalOutDir, "%d.log" % updateId)

    if isdir(indexSplitDir):  # necessary? how could it not be there?
        logging.info("Deleting directory %s" % indexSplitDir)
        shutil.rmtree(indexSplitDir)  # got sometimes exception here...

    pubStore.moveFiles(buildDir, finalOutDir)
    shutil.rmtree(buildDir)

    pubStore.appendToUpdatesTxt(finalOutDir, updateId, maxArticleId, processFiles)
def createChunksSubmitJobs(inDir, finalOutDir, runner, chunkSize):
    """ submit jobs to convert zip and disk files from inDir to outDir
    split files into chunks and submit chunks to cluster system
    write first to temporary dir, and copy over at end of all jobs
    This is based on pubConvElsevier.py
    """
    maxCommon.mustExistDir(finalOutDir)
    minId = pubConf.identifierStart["springer"]

    buildDir = pubGeneric.makeBuildDir(finalOutDir)

    updateId, minId, alreadyDoneFiles = pubStore.parseUpdatesTab(finalOutDir, minId)
    assert chunkSize != None

    # getting filenames from the disk
    diskDir = join(inDir, "disk")
    inDiskFiles = []  # stays empty if there is no springer disk dump to parse
    if int(updateId) == 0 and isdir(diskDir):
        inDiskFiles = parseDiskFnames(diskDir)
    else:
        logging.info("Not first update or no directory %s, not parsing files from springer disk" % diskDir)

    # getting filenames from the updates
    zipDir = join(inDir, "updates")
    inZipFiles = os.listdir(zipDir)
    inZipFiles = [x for x in inZipFiles if x.endswith(".zip")]
    logging.info("Found %d update zip files" % len(inZipFiles))

    # keep order of input files for first run
    if len(alreadyDoneFiles) == 0:
        processFiles = inDiskFiles + inZipFiles
    else:
        processFiles = set(inZipFiles).difference(alreadyDoneFiles)

    if len(processFiles) == 0:
        logging.info("All updates done, not converting anything")
        os.rmdir(buildDir)
        return None
    else:
        logging.info("Total number of files to convert: %d" % (len(processFiles)))

    indexFilename = join(buildDir, "%d_index.tab" % updateId)
    maxArticleId = createIndexFile(zipDir, processFiles, indexFilename, updateId, minId, chunkSize)

    indexSplitDir = join(buildDir, "indexFiles")
    pubStore.splitTabFileOnChunkId(indexFilename, indexSplitDir)

    idFname = concatDois(finalOutDir, buildDir, "doneArticles.tab")

    submitJobs(runner, zipDir, indexSplitDir, idFname, buildDir)

    finishUp(buildDir, finalOutDir)
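# Hypothetical driver sketch showing how the springer variant above might be
# invoked; it reuses only names that occur in this file (maxRun.Runner,
# pubConf.clusterBatchDir, maxCommon.mustExistDir, createChunksSubmitJobs).
# The batch directory name and the default chunk size are placeholders.
def runSpringerUpdate(inDir, outDir, chunkSize=2000):
    batchDir = join(pubConf.clusterBatchDir, "pubConvSpringer")
    maxCommon.mustExistDir(batchDir, makeDir=True)
    runner = maxRun.Runner(batchDir=batchDir)
    createChunksSubmitJobs(inDir, outDir, runner, chunkSize)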
def __init__(self, dataset):
    self.markerCountsBase = MARKERCOUNTSBASE
    self.markerDirBase = MARKERDIRBASE

    self.pubMapBaseDir = pubConf.pubMapBaseDir
    maxCommon.mustExistDir(pubConf.pubMapBaseDir, makeDir=True)

    self.dataset = dataset
    if "," in dataset:
        logging.debug("comma in dataset description, deferring config")
        return

    self.textDir = pubConf.resolveTextDir(dataset)
    if self.textDir == None:
        raise Exception("dataset %s can not be resolved to a directory" % dataset)

    self._defineBatchDirectories()
def newBatch(self, outDir, bundle):
    " create a new batch directory and save ourselves as JSON into it "
    # define the dir
    if self.batchId is None:
        self.batchId = 0
    else:
        self.batchId = self.batchId + 1
        logging.debug("Increasing batchId, new batchId is %s" % self.batchId)
    self.batchDir = join(self.baseDirBatches, str(self.batchId))

    # create the dir
    if isdir(self.batchDir):
        if not len(os.listdir(self.batchDir)) == 0:
            raise Exception("%s contains files, is this really a new run?" % self.batchDir)
    else:
        logging.debug("Creating dir %s" % self.batchDir)
        os.makedirs(self.batchDir)

    assert(outDir != None and outDir != "")
    maxCommon.mustExistDir(outDir, makeDir=True)
    self.pubMapBaseDir = outDir
    logging.debug("Main pipeline outdir is %s" % outDir)

    self.bundleName = bundle
    self.datasets = pubConf.bundleToText[bundle]

    self.markerCountsBase = MARKERCOUNTSBASE
    self.markerDirBase = MARKERDIRBASE

    # base working directory for dataset
    self.baseDir = join(self.pubMapBaseDir, self.bundleName)

    self.batchId = self._findCurrentBatchDir()
    self.batchDir = join(self.baseDirBatches, str(self.batchId))
    self._defineBatchDirectories()

    # populate text input updateIds
    self.updateIds = readUpdateIds(self.batchDir, self.datasets)
def __init__(self, dataset, outDir):
    self.markerCountsBase = MARKERCOUNTSBASE
    self.markerDirBase = MARKERDIRBASE

    assert(outDir != None and outDir != "")
    self.pubMapBaseDir = outDir
    maxCommon.mustExistDir(self.pubMapBaseDir, makeDir=True)
    logging.debug("Main pipeline outdir is %s" % outDir)

    self.dataset = dataset
    if "," in dataset:
        logging.debug("comma in dataset description, deferring config")
        return

    self.textDir = pubConf.resolveTextDir(dataset)
    if self.textDir == None:
        raise Exception("dataset %s can not be resolved to a directory" % dataset)

    # base dir for dataset
    self.baseDir = join(self.pubMapBaseDir, self.dataset)

    self.batchId = self._findCurrentBatchDir()
    self.batchDir = join(self.baseDirBatches, str(self.batchId))

    self._defineBatchDirectories()
def createChunksSubmitJobs(inDir, outDir, minId, chunkCount, runner):
    """ convert Consyn ZIP files from inDir to outDir
    split files into chunks and submit chunks to cluster system
    """
    maxCommon.mustExistDir(outDir)
    updateId, minId, alreadyDoneFiles = pubStore.parseUpdatesTab(outDir, minId)
    chunkSize = pubStore.guessChunkSize(outDir)

    finalOutDir = outDir
    outDir = tempfile.mktemp(dir=outDir, prefix="temp.pubConvElsevier.update.")
    os.mkdir(outDir)
    chunkCount = None

    inFiles = os.listdir(inDir)
    inFiles = [x for x in inFiles if x.endswith(".ZIP")]
    # keep order of input files for first run
    if len(alreadyDoneFiles) != 0:
        processFiles = set(inFiles).difference(alreadyDoneFiles)
    else:
        processFiles = inFiles

    if len(processFiles) == 0:
        logging.info("All updates done, not converting anything")
        return None

    indexFilename = join(outDir, "%d_index.tab" % updateId)
    maxArticleId = createIndexFile(inDir, processFiles, indexFilename, updateId, minId, chunkCount, chunkSize)

    indexSplitDir = indexFilename + ".tmp.split"
    pubStore.splitTabFileOnChunkId(indexFilename, indexSplitDir)
    submitJobs(runner, indexSplitDir, outDir)

    pubStore.moveFiles(outDir, finalOutDir)
    shutil.rmtree(outDir)

    if isdir(indexSplitDir):  # how could it not be there?
        logging.info("Deleting directory %s" % indexSplitDir)
        shutil.rmtree(indexSplitDir)  # got sometimes exception here...

    pubStore.appendToUpdatesTxt(finalOutDir, updateId, maxArticleId, processFiles)
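# The snippets above assume roughly the following imports. The standard-library
# names follow directly from the calls used; the project modules (pubConf,
# pubStore, pubGeneric, maxCommon, maxRun) come from the surrounding codebase
# and their exact import paths are an assumption.
import os, glob, gzip, shutil, tempfile, logging
from os.path import join, basename, isdir

import pubConf, pubStore, pubGeneric, maxCommon, maxRun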