def atexit_delete(filename):
    '''Best-effort deletion of a file, registered via atexit by CSVFixer
    worker jobs.

    /filename/  Name of the file to delete.

    Failures are logged as warnings and never raised, so an undeletable
    file cannot break interpreter shutdown.
    '''
    try:
        os.remove(filename)
    except Exception as ex:
        msg = 'CSVFixer: Exception "{0}" deleting file "{1}".'.format(ex, filename)
        logger.warning(msg)
def test_4_2():
    '''Pipeline test: two JSON records piped through a QUOTE_ALL dialect.

    Feeds two one-key JSON objects to a Pipeline configured for JSON input
    and asserts on the captured output in /xout/.
    '''
    jsodata = io.StringIO('{"a": 123}\n{"a": 45}')
    xout.re_init()
    p4 = None  # so the except handler can't hit an unbound local if Pipeline() itself raises
    try:
        p4 = Pipeline({
                'input-format': 'json',
                'header': ['a'],
                'dialect': 'nix',
                "dialects": {
                    "nix": {
                        "quoting": "QUOTE_ALL",
                        "lineterminator": "\n"
                    }
                }
            })
        p4(jsodata, xout)
    except Exception as ex:
        logger.warning('{1}: {0} ({2})'.format(ex, type(ex).__name__,
                                               p4.dialect if p4 is not None else None))
        assert False
    # Expecting 3 lines: 123 / 45 / 2 (number of lines)
    # NOTE(review): was ''.join(["123", "45", 2]) — str.join raises TypeError on the
    # int, so this assert could never execute; all items must be strings.
    assert xout.readlines() == [''.join(["123", "45", "2"])]
def log_warn(self, msg):
    '''Forward /msg/ to the module-level logger at WARNING level.'''
    logger.warning(msg)
def task(cwd):
    '''Task as a gevent Greenlet that processes one file name pattern.

    /cwd/   Current working directory.

    Pulls (config, pattern) jobs from the module-level queue /jobqu/ until
    the queue stays empty for 10 seconds, then exits.
    '''
    while True:
        # config  = A dict containing configuration for the task;
        # pattern = Pattern to match for input file names.
        try:
            config, pattern = jobqu.get(timeout=10)
        except Empty:
            # No job for 10 seconds: the worker terminates.
            break
        if pattern == '' or config.get('disabled', False):
            logger.debug('CSVFixer: Ignore empty pattern or disabled task')
            continue
        pattern = config.get('pattern', pattern) # 'pattern' may be configured inside task
        dest = config.get('destination', cwd)
        linkfolder = config.get('link-folder')
        forge_path(dest)  # ensure the destination folder exists
        process = Pipeline(config)
        keep_times = config.get('times', False)
        # Output-name rewrite rules: list of (compiled regex, format string).
        rename = [ (re.compile(xk),xv) for xk,xv in config.get('rename', {}).items() ]
        logger.debug('CSVFixer: task = %s, destination = "%s"' % (pattern, dest))
        for zipfn in glob.glob(pattern):
            stinfo = os.stat(zipfn)
            logger.debug('CSVFixer: Fixing file "{0}", mtime = {1}'.format(
                zipfn, time.strftime('%c', time.localtime(stinfo.st_mtime))))
            if zipfn[-4:] != '.zip':
                ## Assume that it is a text CSV file if file name does not end with .zip:
                zipf = None
                ziplist = [zipfn]
            else:
                try:
                    zipf = ZipFile(zipfn)
                    ziplist = zipf.namelist()
                    logger.debug('CSVFixer: Found list in zip file = %s' % (format(ziplist)))
                except BadZipfile:
                    logger.warning('CSVFixer: zip file "%s" is bad.' % (zipfn))
                    continue
            fbasename = fwpath = ''
            for fn in ziplist:
                # In append mode ('file-mode' == 'a') every member of the archive
                # goes to the first computed output path; otherwise each input
                # file gets its own (possibly renamed) output file.
                if fwpath == '' or config.get('file-mode') != 'a':
                    fwname = fn
                    for rex, fmt in rename:
                        mx = rex.search(fwname)
                        if mx:
                            try:
                                fwname = fmt.format(*mx.groups())
                            except Exception as ex:
                                logger.warning('Exception fixing "{0}" with "{1}" and groups = {2}'.format(fn, fmt, mx.groups()))
                            # First matching rename rule wins.
                            break
                    fbasename = os.path.basename(fwname)
                    fwpath = os.path.join(dest, fbasename)
                logger.debug('Processing file "{0}" to "{1}"'.format(fn, fwname))
                # NOTE(review): the handle passed to process() is never explicitly
                # closed here — presumably the Pipeline closes it; confirm.
                lines = process(open(fn, 'r') if zipf is None else zipf.open(fn, 'r'), fwpath)
                logger.debug('{0} lines processed in file "{1}"'.format(lines, fn))
                # Set fixed file's timestamps if so configured:
                if keep_times:
                    os.utime(fwpath, (stinfo.st_mtime, stinfo.st_mtime))
                    logger.debug('Set file "{0}" atime and mtime to {1}'.format(
                        fwpath, time.strftime('%c', time.localtime(stinfo.st_mtime))))
            # Archive the .zip file if configured so
            if config.get('delete', False):
                logger.debug('File "%s" registered to be deleted' % (zipfn))
                atexit.register(atexit_delete, zipfn)
            else:
                act = config.get('postprocess')
                if act != None:
                    logger.debug('File "%s" registered to be postprocessed with "%s"' % (zipfn, act))
                    atexit.register(atexit_process, zipfn, act)
            # Delete empty file if so configured:
            if fwpath != '' and config.get('delete-empty', True) and os.stat(fwpath).st_size < 1:
                os.unlink(fwpath)
                logger.debug('Deleted empty output file "{0}"'.format(fwpath))
            elif linkfolder:
                # Hard-link the (non-empty) output into the configured link folder.
                try:
                    os.link(fwpath, os.path.join(linkfolder, fbasename))
                except Exception as err:
                    logger.error('Error link file "{0}" to folder {1}: {2}'.format(fwpath, linkfolder, err))
        jobqu.task_done()
        logger.debug('Task "{0}" completed'.format(pattern))
def __call__(self, fnr, fnw):
    ''' Process given file and generate output.

    /fnr/   (Name of) file to read from.
    /fnw/   (Name of) file to write to.

    The /fnr/ file is read with a default csv.DictReader() as of now, or a
    JSOReader object if explicitly configured so.
    -- May need to revise to allow handling of CSV format variations.

    Returns number of rows (records) processed in the CSV file.
    '''
    # Open files if they are given as file names:
    fin = csvio.Reader(open(fnr, 'r') if isinstance(fnr, str) else fnr, self.ends)
    fout = open(fnw, self.file_mode) if isinstance(fnw, str) else fnw
    # Only write a header when configured AND we are at the start of the file
    # (appending to an existing file must not repeat the header).
    write_header = self.write_header and (fout.tell() == 0)
    # Skip non-data if so configured:
    # 'more' defaults to 0 (= no skipping) when the config does not supply it;
    # previously a missing 'more' key raised KeyError in the loop below.
    skip = {'line': 0, 'pass': 0, 'till': 0, 'more': 0}
    skip.update(self.skip)
    lineno = 0
    while skip['more']:
        try:
            line = next(fin)
            if isinstance(line, bytes):
                line = line.decode('utf-8')
        except StopIteration:
            logger.warning('Unexpected end-of-file when skipping to data in {0}:{1}'.format(fnr, lineno))
            return 0
        # NOTE(review): skip['till'] / skip['pass'] are used as compiled regex
        # objects here — presumably precompiled by the configuration; confirm.
        if skip['till'] and skip['till'].match(line):
            logger.debug('Skip-till matching line {0}: {1}'.format(lineno+1, line))
            fin.backup()  # push the matched line back: it is the first data line
            break
        lineno += 1
        if (skip['pass'] and skip['pass'].match(line)) or\
           (skip['line'] and skip['line'] <= lineno):
            break
        logger.debug('Skipping line {0}: {1}'.format(lineno, line))
    # If no header is configured, assuming the next line in /fin/ is it:
    # TBD: Make output header different than input header, optionally.
    rheader = None
    header = self.header
    logger.debug('{0}: {1}to output CSV header: {2}'.format(fnw, '' if write_header else 'not ', header))
    if self.read_header or header is None:
        try:
            # Strip unwanted characters from each raw header column.
            rheader = [ self.header_clean.sub('', x) for x in next(fin).split(',') ]
        except Exception as ex:
            logger.debug('{2}: {0} (lineno = {1})'.format(ex, lineno, type(ex).__name__))
            logger.warning('Unexpected error when reading CSV header in {0}:{1}'.format(fnr, lineno))
            return 0
        lineno += 1
        logger.debug('Read header: {0}'.format(rheader))
        # Apply configured (regex, format) rewrite rules to the header columns.
        for rex, fmt in self.header_fix:
            nhdr = []
            for col in rheader:
                mx = rex.match(col)
                if mx:
                    try:
                        col = fmt.format(*mx.groups())
                    except Exception as ex:
                        logger.warning('Exception fixing "{0}" with "{1}" and groups = {2}'.format(col, fmt, mx.groups()))
                nhdr.append(col)
            rheader = nhdr
            logger.debug('Header fixed: {0}'.format(rheader))
    #
    # Read through the input file and write out: Read error(s) are logged but ignored.
    #
    lineno = 0
    with csvio.Writer(fout, header or rheader, write_header, self.dialect) as fw:
        # filters: Filters to pass data through. If missing, then straight thru.
        # Chain is built in reverse so the first configured filter runs first.
        filter1 = fw
        try:
            for fltr in reversed(self.filters):
                # maxsplit=1 so dotted module paths ('pkg.mod.Class') split into
                # ('pkg.mod', 'Class'); a bare rsplit('.') unpack-crashed on them.
                modname, cname = fltr.rsplit('.', 1)
                mod = __import__('c9r.util.filter.'+modname, fromlist=[cname])
                klass = getattr(mod, cname)
                filter1 = klass(filter1).open()
        except ImportError:
            logger.warning('ImportError for filter {0}'.format(fltr))
            raise
        csvreader = self.ireader(fin, fieldnames=(rheader or header))
        while True:
            try:
                line = next(csvreader)
                lineno += 1
                filter1.write(line)
            except StopIteration:
                break
            except Exception as ex:
                # Row-level errors are logged (with traceback) but do not abort the run.
                logger.warning('{2}: {0} (lineno = {1})'.format(ex, lineno, type(ex).__name__))
                logger.debug('\tline = {0})'.format(line))
                print('-'*60)
                traceback.print_exc(file=sys.stdout)
                print('-'*60)
                #logger.debug(traceback.format_tb(sys.exc_info()))
        logger.debug('Closing filter 1: {0}, lines = {1}, fout size = {2}'.format(type(filter1).__name__, lineno, fout.tell()))
        filter1.close()
    return lineno