Exemplo n.º 1
0
def splitTabFileOnChunkId(filename, outDir, chunkSize=None, chunkCount=None):
    """
    Split a tab-separated file into several files written to outDir.

    The output file each row goes to is chosen as follows:
    - chunkSize given: rows are assigned in input order, chunkSize consecutive
      rows per piece (piece names are zero-padded numbers: 00000, 00001, ...).
      chunkSize wins if chunkCount is given as well.
    - chunkCount given: rows are distributed round-robin over chunkCount
      pieces (row index modulo chunkCount), again with zero-padded names.
    - neither given: the row's "chunkId" field is used as the output filename.

    outDir is deleted if it already exists and is then re-created, so any
    previous content is lost. The first line of the input file is written
    at the top of every output file.

    Returns the list of chunk ids (= file names created inside outDir).
    """
    # always start from a fresh output directory
    if os.path.isdir(outDir):
        logging.info("Deleting %s" % outDir)
        shutil.rmtree(outDir)

    if not os.path.isdir(outDir):
        logging.info("Creating directory %s" % outDir)
        os.makedirs(outDir)
    maxCommon.mustBeEmptyDir(outDir)

    # keep the header line so it can be repeated in every piece;
    # the context manager makes sure the input handle is closed again
    with open(filename) as inFh:
        headerLine = inFh.readline()

    # read data into data dict and split by chunk id
    # NOTE(review): presumably iterTsvRows skips the header line itself - confirm
    logging.info("Reading %s, splitting into pieces" % filename)
    data = {}
    for i, row in enumerate(maxCommon.iterTsvRows(filename, encoding=None)):
        if chunkSize is not None:
            # floor division: identical result on Python 2 and Python 3
            chunkId = "%05d" % (i // chunkSize)
        elif chunkCount is not None:
            # modulo chunkCount (not chunkSize, which is None in this branch)
            chunkId = "%05d" % (i % chunkCount)
        else:
            chunkId = row.chunkId
        data.setdefault(str(chunkId), []).append("\t".join(row) + "\n")

    # write to outDir
    logging.info("Splitting file data, Writing to %d files in %s/xxxx.tgz" % (len(data), outDir))
    pm = maxCommon.ProgressMeter(len(data))
    for chunkIdString, lines in data.items():
        outfname = os.path.join(outDir, chunkIdString)
        logging.debug("Writing to %s" % outfname)
        # "with" closes the file even if one of the writes fails
        with open(outfname, "w") as fh:
            fh.write(headerLine)
            fh.writelines(lines)
        pm.taskCompleted()

    # list() so that callers get a real list on Python 2 and Python 3
    return list(data)
Exemplo n.º 2
0
def filterCmd(inSpec, searchSpec, outSpec, options):
    """
    Resolve outSpec to a text directory, make sure it is usable as an
    output directory and submit the jobs for inSpec/searchSpec.

    Raises an AssertionError if outSpec cannot be resolved to a directory.
    The options argument is not used here.
    Returns whatever submitJobs returns.
    """
    outDir = pubConf.resolveTextDir(outSpec)
    # identity check: None is a singleton, "!=" could be fooled by custom __ne__
    assert outDir is not None, "cannot resolve text directory for %r" % (outSpec,)
    # NOTE(review): presumably aborts if outDir already contains files - confirm
    maxCommon.mustBeEmptyDir(outDir)
    return submitJobs(inSpec, searchSpec, outDir)
Exemplo n.º 3
0
def filterCmd(inSpec, searchSpec, outSpec, options):
    """
    Resolve outSpec to a text directory, make sure it is usable as an
    output directory and submit the jobs for inSpec/searchSpec.

    Raises an AssertionError if outSpec cannot be resolved to a directory.
    The options argument is not used here.
    Returns whatever submitJobs returns.
    """
    outDir = pubConf.resolveTextDir(outSpec)
    # identity check: None is a singleton, "!=" could be fooled by custom __ne__
    assert outDir is not None, "cannot resolve text directory for %r" % (outSpec,)
    # NOTE(review): presumably aborts if outDir already contains files - confirm
    maxCommon.mustBeEmptyDir(outDir)
    return submitJobs(inSpec, searchSpec, outDir)