def fragmentInputBySize(infile, tmpdir, chunk, fileType, fragmentBase,
                        splitOnSize=True, **kwargs):
    """
    Break up input into files of size chunk in tmpdir.

    Opens ``infile`` with openInputFile() and hands the resulting stream to
    fragmentInputStreamBySize(), forwarding all other arguments unchanged.

    Args:
        infile: input source passed to openInputFile(). When it is None the
            opened handle is deliberately left open (presumably a shared
            stream such as STDIN -- confirm against openInputFile()).
        tmpdir: directory in which the fragment files are created.
        chunk: target size of each fragment.
        fileType: record-type description, forwarded as-is.
        fragmentBase: forwarded as-is (logged under the key 'base').
        splitOnSize: forwarded as-is to fragmentInputStreamBySize().
        **kwargs: any extra options, forwarded as-is.

    Returns:
        The number of fragments, as reported by fragmentInputStreamBySize().
    """
    logging.debug("Fragmenting input: %r" % ({
        'infile': infile,
        'tmpDir': tmpdir,
        'chunk': chunk,
        'base': fragmentBase,
        'kwargs': kwargs,
    }))

    inhandle = openInputFile(infile)
    try:
        num = fragmentInputStreamBySize(inhandle, tmpdir, chunk, fileType,
                                        fragmentBase,
                                        splitOnSize=splitOnSize, **kwargs)
    finally:
        # Release the handle even when fragmenting fails part-way through;
        # a handle obtained for infile=None is not ours to close.
        if infile is not None:
            inhandle.close()
    return num
def fragmentInputBySize(infile, tmpdir, chunk, fileType, fragmentBase,
                        splitOnSize=True, **kwargs):
    """
    Break up input into files of size chunk in tmpdir.
    Return number of fragments.

    The input is opened via openInputFile() and the open stream is passed,
    together with every remaining argument, to fragmentInputStreamBySize().

    Args:
        infile: what to open; given to openInputFile(). If None, the handle
            that comes back is never closed here (presumably it is a shared
            stream such as STDIN -- verify against openInputFile()).
        tmpdir: where the fragment files go.
        chunk: size of each fragment.
        fileType: passed through untouched.
        fragmentBase: passed through untouched; appears in the debug log as
            'base'.
        splitOnSize: passed through untouched.
        **kwargs: passed through untouched.

    Returns:
        Whatever fragmentInputStreamBySize() returns (the fragment count).
    """
    logging.debug(
        "Fragmenting input: %r" %
        ({
            'infile': infile,
            'tmpDir': tmpdir,
            'chunk': chunk,
            'base': fragmentBase,
            'kwargs': kwargs}))

    inhandle = openInputFile(infile)
    # Only handles opened for a real infile are owned (and closed) here.
    ownsHandle = infile is not None
    try:
        return fragmentInputStreamBySize(
            inhandle,
            tmpdir,
            chunk,
            fileType,
            fragmentBase,
            splitOnSize=splitOnSize,
            **kwargs)
    finally:
        # Runs on both success and failure so the file is never leaked.
        if ownsHandle:
            inhandle.close()
def getSizePerChunk(infile, splits, fileType, splitOnSize=False):
    """
    Get total size of all records and return target size for each chunk to
    end up with number of chunks specified by 'splits'.

    Args:
        infile: input to open with openInputFile(). Must not be None: the
            whole input has to be read once here just to be measured.
        splits: desired number of chunks, handed to calculateChunkSize().
        fileType: object providing ``recordStreamer(handle)`` to iterate
            records and, when splitOnSize is true, ``sizer(record)``.
        splitOnSize: if true, each record contributes ``fileType.sizer``'s
            value to the total; otherwise recordCounter() is used per record.

    Returns:
        The result of calculateChunkSize(totalSize, splits).

    Raises:
        Exception: if infile is None.
    """
    if infile is None:
        raise Exception("We cannot determine chunk size from STDIN!")

    if splitOnSize:
        # get a custom function that returns the size of this type of record
        recordSizer = fileType.sizer
    else:
        # just return 1 for each record
        recordSizer = recordCounter

    # loop through records, making sure the handle is closed even if the
    # streamer or the sizer raises part-way through the file
    inhandle = openInputFile(infile)
    try:
        totalSize = sum(recordSizer(record)
                        for record in fileType.recordStreamer(inhandle))
    finally:
        inhandle.close()

    return calculateChunkSize(totalSize, splits)
def __init__(self, fileName, *args):
    """
    Open ``fileName`` and initialise the LineCounter base over the handle.

    Args:
        fileName: passed to openInputFile() and then stored, unchanged, as
            ``self.fileName``.
        *args: extra positional arguments forwarded to openInputFile().
    """
    # Open first: if this fails, neither the base class nor the attribute
    # below has been touched.
    handle = openInputFile(fileName, *args)
    LineCounter.__init__(self, handle)
    self.fileName = fileName
def __init__(self, fileName, *args):
    """
    Open ``fileName`` and initialise the LineCounter base over the handle.

    Args:
        fileName: passed to openInputFile(). If the object has a ``name``
            attribute, that value is stored as ``self.fileName``; otherwise
            the object itself is stored.
        *args: extra positional arguments forwarded to openInputFile().
    """
    handle = openInputFile(fileName, *args)
    LineCounter.__init__(self, handle)
    # Three-argument getattr falls back to the object itself whenever the
    # attribute lookup raises AttributeError.
    self.fileName = getattr(fileName, 'name', fileName)