def createChunksSubmitJobs(inDir, outDir, minId, runner, chunkSize):
    """Convert Consyn ZIP files from inDir to outDir.

    The ZIP files that still need processing are indexed, the index is split
    into chunks and the chunks are submitted to the cluster system. Everything
    is first written to a "build" directory inside outDir and only moved over
    to outDir after all jobs have finished.

    Args:
        inDir: directory that is scanned for files ending in ".ZIP".
        outDir: output directory, checked with maxCommon.mustExistDir; it also
            holds the update bookkeeping read by pubStore.parseUpdatesTab.
        minId: passed to pubStore.parseUpdatesTab, which returns the value
            that is then handed to createIndexFile (presumably the first
            article ID to assign -- TODO confirm).
        runner: handed to submitJobs unchanged.
        chunkSize: chunk size passed to createIndexFile; if None it is
            obtained from pubStore.guessChunkSize(outDir).

    Returns:
        None, both when there is nothing to convert and after a full run.

    Raises:
        OSError: from os.mkdir if the "build" directory already exists.
        AssertionError: if no chunk size could be determined.
    """
    maxCommon.mustExistDir(outDir)

    updateId, minId, alreadyDoneFiles = pubStore.parseUpdatesTab(outDir, minId)
    if chunkSize is None:
        chunkSize = pubStore.guessChunkSize(outDir)
    assert chunkSize is not None, "could not determine a chunk size"

    # Build into a temporary dir and fail if it exists: it should not exist,
    # otherwise something is wrong (e.g. an earlier run did not finish).
    finalOutDir = outDir
    buildDir = join(outDir, "build")
    os.mkdir(buildDir)

    # Keep the os.listdir order of the input files on every run. Filtering a
    # list (instead of taking a set difference) keeps the order stable and
    # always passes the same type (list) on to the functions below.
    doneFiles = set(alreadyDoneFiles)
    processFiles = [
        fname for fname in os.listdir(inDir)
        if fname.endswith(".ZIP") and fname not in doneFiles
    ]

    if not processFiles:
        logging.info("All updates done, not converting anything")
        # buildDir is still empty at this point, so rmdir is sufficient
        os.rmdir(buildDir)
        return None

    indexFilename = join(buildDir, "%d_index.tab" % updateId)
    maxArticleId = createIndexFile(inDir, processFiles, indexFilename,
                                   updateId, minId, chunkSize)
    indexSplitDir = indexFilename + ".tmp.split"
    chunkIds = pubStore.splitTabFileOnChunkId(indexFilename, indexSplitDir)
    idFname = pubGeneric.concatIdentifiers(finalOutDir, indexSplitDir,
                                           "doneArticles.tab")

    submitJobs(runner, inDir, chunkIds, indexSplitDir, idFname, buildDir)

    pubGeneric.concatDelIdFiles(buildDir, finalOutDir, "%d_ids.tab" % updateId)
    pubGeneric.concatDelLogs(buildDir, finalOutDir, "%d.log" % updateId)

    # Remove the split index before the results are moved, so that it does
    # not end up in the final directory.
    if isdir(indexSplitDir):  # necessary? how could it not be there?
        logging.info("Deleting directory %s", indexSplitDir)
        shutil.rmtree(indexSplitDir)  # got sometimes exception here...
    pubStore.moveFiles(buildDir, finalOutDir)
    shutil.rmtree(buildDir)

    # Record the update only after all results are in place.
    pubStore.appendToUpdatesTxt(finalOutDir, updateId, maxArticleId, processFiles)
def createChunksSubmitJobs(inDir, outDir, minId, runner, chunkSize):
    """Convert Consyn ZIP files from inDir to outDir.

    The ZIP files that still need processing are indexed, the index is split
    into chunks and the chunks are submitted to the cluster system. Everything
    is first written to a "build" directory inside outDir and only moved over
    to outDir after all jobs have finished.

    Args:
        inDir: directory that is scanned for files ending in ".ZIP".
        outDir: output directory, checked with maxCommon.mustExistDir; it also
            holds the update bookkeeping read by pubStore.parseUpdatesTab.
        minId: passed to pubStore.parseUpdatesTab, which returns the value
            that is then handed to createIndexFile (presumably the first
            article ID to assign -- TODO confirm).
        runner: handed to submitJobs unchanged.
        chunkSize: chunk size passed to createIndexFile; if None it is
            obtained from pubStore.guessChunkSize(outDir).

    Returns:
        None, both when there is nothing to convert and after a full run.

    Raises:
        OSError: from os.mkdir if the "build" directory already exists.
        AssertionError: if no chunk size could be determined.
    """
    maxCommon.mustExistDir(outDir)

    updateId, minId, alreadyDoneFiles = pubStore.parseUpdatesTab(outDir, minId)
    if chunkSize is None:
        chunkSize = pubStore.guessChunkSize(outDir)
    assert chunkSize is not None, "could not determine a chunk size"

    # Build into a temporary dir and fail if it exists: it should not exist,
    # otherwise something is wrong (e.g. an earlier run did not finish).
    finalOutDir = outDir
    buildDir = join(outDir, "build")
    os.mkdir(buildDir)

    # Keep the os.listdir order of the input files on every run. Filtering a
    # list (instead of taking a set difference) keeps the order stable and
    # always passes the same type (list) on to the functions below.
    doneFiles = set(alreadyDoneFiles)
    processFiles = [
        fname for fname in os.listdir(inDir)
        if fname.endswith(".ZIP") and fname not in doneFiles
    ]

    if not processFiles:
        logging.info("All updates done, not converting anything")
        # buildDir is still empty at this point, so rmdir is sufficient
        os.rmdir(buildDir)
        return None

    indexFilename = join(buildDir, "%d_index.tab" % updateId)
    maxArticleId = createIndexFile(inDir, processFiles, indexFilename,
                                   updateId, minId, chunkSize)
    indexSplitDir = indexFilename + ".tmp.split"
    chunkIds = pubStore.splitTabFileOnChunkId(indexFilename, indexSplitDir)
    idFname = pubGeneric.concatIdentifiers(finalOutDir, indexSplitDir,
                                           "doneArticles.tab")

    submitJobs(runner, inDir, chunkIds, indexSplitDir, idFname, buildDir)

    pubGeneric.concatDelIdFiles(buildDir, finalOutDir, "%d_ids.tab" % updateId)
    pubGeneric.concatDelLogs(buildDir, finalOutDir, "%d.log" % updateId)

    # Remove the split index before the results are moved, so that it does
    # not end up in the final directory.
    if isdir(indexSplitDir):  # necessary? how could it not be there?
        logging.info("Deleting directory %s", indexSplitDir)
        shutil.rmtree(indexSplitDir)  # got sometimes exception here...
    pubStore.moveFiles(buildDir, finalOutDir)
    shutil.rmtree(buildDir)

    # Record the update only after all results are in place.
    pubStore.appendToUpdatesTxt(finalOutDir, updateId, maxArticleId, processFiles)
def createChunksSubmitJobs(inDir, outDir, minId, chunkCount, runner):
    """Convert Consyn ZIP files from inDir to outDir.

    The ZIP files that still need processing are indexed, the index is split
    into chunks and the chunks are submitted to the cluster system. Results
    are written to a temporary directory inside outDir and moved over to
    outDir once the jobs are done.

    Args:
        inDir: directory that is scanned for files ending in ".ZIP".
        outDir: output directory, checked with maxCommon.mustExistDir; it also
            holds the update bookkeeping read by pubStore.parseUpdatesTab.
        minId: passed to pubStore.parseUpdatesTab, which returns the value
            that is then handed to createIndexFile (presumably the first
            article ID to assign -- TODO confirm).
        chunkCount: ignored. It is reset to None before being passed to
            createIndexFile; the chunk size from pubStore.guessChunkSize is
            passed alongside it.
        runner: handed to submitJobs unchanged.

    Returns:
        None, both when there is nothing to convert and after a full run.
    """
    maxCommon.mustExistDir(outDir)

    updateId, minId, alreadyDoneFiles = pubStore.parseUpdatesTab(outDir, minId)
    chunkSize = pubStore.guessChunkSize(outDir)
    finalOutDir = outDir
    # The caller-supplied value is deliberately discarded (kept from the
    # original code); only chunkSize carries information to createIndexFile.
    chunkCount = None

    # Keep the os.listdir order of the input files on every run. Filtering a
    # list (instead of taking a set difference) keeps the order stable and
    # always passes the same type (list) on to the functions below.
    doneFiles = set(alreadyDoneFiles)
    processFiles = [
        fname for fname in os.listdir(inDir)
        if fname.endswith(".ZIP") and fname not in doneFiles
    ]

    if not processFiles:
        # Nothing to do: return before any temporary directory is created,
        # so no empty temp dir is left behind in outDir.
        logging.info("All updates done, not converting anything")
        return None

    # mkdtemp creates the directory atomically, unlike mktemp + mkdir.
    # NOTE(review): mkdtemp creates the directory with mode 0700 -- fine as
    # long as the cluster jobs run as the same user; confirm.
    tmpOutDir = tempfile.mkdtemp(dir=finalOutDir,
                                 prefix="temp.pubConvElsevier.update.")

    indexFilename = join(tmpOutDir, "%d_index.tab" % updateId)
    maxArticleId = createIndexFile(inDir, processFiles, indexFilename,
                                   updateId, minId, chunkCount, chunkSize)
    indexSplitDir = indexFilename + ".tmp.split"
    pubStore.splitTabFileOnChunkId(indexFilename, indexSplitDir)

    submitJobs(runner, indexSplitDir, tmpOutDir)

    # indexSplitDir lives inside tmpOutDir, so it has to be removed before
    # tmpOutDir is moved and deleted; afterwards the path no longer exists.
    if isdir(indexSplitDir):  # how could it not be there?
        logging.info("Deleting directory %s", indexSplitDir)
        shutil.rmtree(indexSplitDir)  # got sometimes exception here...
    pubStore.moveFiles(tmpOutDir, finalOutDir)
    shutil.rmtree(tmpOutDir)

    # Record the update only after all results are in place.
    pubStore.appendToUpdatesTxt(finalOutDir, updateId, maxArticleId, processFiles)