def splitTabFileOnChunkId(filename, outDir, chunkSize=None, chunkCount=None):
    """Split a tab-separated file into one file per chunk under outDir.

    By default the chunkId field of each row names the output file it goes
    to. If chunkSize is given, the chunkId field is ignored and rows are
    grouped into consecutive pieces of chunkSize lines each. If chunkCount
    is given, rows are dealt round-robin into chunkCount output files.

    outDir is deleted (if present) and recreated; each output file gets a
    copy of the input header line. Returns the list of chunk-id strings,
    which are also the output filenames within outDir.
    """
    if isdir(outDir):
        logging.info("Deleting %s" % outDir)
        shutil.rmtree(outDir)
    if not os.path.isdir(outDir):
        logging.info("Creating directory %s" % outDir)
        os.makedirs(outDir)
    maxCommon.mustBeEmptyDir(outDir)

    # read data into data dict and split by "chunkId" field
    headerLine = open(filename).readline()
    logging.info("Reading %s, splitting into pieces" % filename)
    data = {}
    i = 0
    for row in maxCommon.iterTsvRows(filename, encoding=None):
        if chunkSize is None and chunkCount is None:
            chunkId = row.chunkId
        elif chunkSize is not None:
            # consecutive pieces of chunkSize lines; "//" keeps integer
            # division semantics under both Python 2 and Python 3
            chunkId = "%05d" % (i // chunkSize)
        elif chunkCount is not None:
            # BUGFIX: original computed "i % chunkSize", but chunkSize is
            # None in this branch (TypeError at runtime); round-robin over
            # chunkCount files is the intended behavior
            chunkId = "%05d" % (i % chunkCount)
        data.setdefault(str(chunkId), []).append("\t".join(row) + "\n")
        i += 1

    # write to outDir
    logging.info("Splitting file data, Writing to %d files in %s/xxxx.tgz" % (len(data), outDir))
    pm = maxCommon.ProgressMeter(len(data))
    for chunkIdString, lines in data.iteritems():
        outfname = os.path.join(outDir, chunkIdString)
        logging.debug("Writing to %s" % outfname)
        fh = open(outfname, "w")
        fh.write(headerLine)
        # writelines batches the per-line writes through the buffered layer
        fh.writelines(lines)
        fh.close()
        pm.taskCompleted()
    return data.keys()
def filterCmd(inSpec, searchSpec, outSpec, options):
    """Resolve outSpec to a text directory, require it to be empty, and
    submit the filter jobs for inSpec/searchSpec into it.

    Returns whatever submitJobs returns. The "options" argument is accepted
    for interface compatibility but is not used here.
    """
    targetDir = pubConf.resolveTextDir(outSpec)
    assert targetDir is not None
    maxCommon.mustBeEmptyDir(targetDir)
    return submitJobs(inSpec, searchSpec, targetDir)
def filterCmd(inSpec, searchSpec, outSpec, options):
    """Resolve outSpec to a text directory, require it to be empty, and
    submit the filter jobs for inSpec/searchSpec into it. "options" is
    accepted but not used here.
    """
    # NOTE(review): this is a byte-identical re-definition of filterCmd,
    # which appears earlier in this file; Python keeps only this later one.
    # Confirm the duplication is intentional (it looks like a merge artifact).
    outDir = pubConf.resolveTextDir(outSpec)
    assert (outDir != None)
    maxCommon.mustBeEmptyDir(outDir)
    return submitJobs(inSpec, searchSpec, outDir)