def normalize(self, key, value):
    """Normalize one cell value and dispatch it to a per-key handler.

    Strips any BOM (U+FEFF) characters, recording an EncodingError when
    one was present, then — unless the cleaned value is a null sentinel
    ('n/a', 'na', 'no') — yields from ``self.<key>`` if such an attribute
    exists, falling back to ``self.default``.
    """
    cleaned = value.replace('\ufeff', '')  # FIXME utf-16 issue
    if cleaned != value:
        # TODO can we decouple encoding from value normalization?
        message = f"encoding feff error in '{self.path}'"
        log.error(message)
        self.addError(exc.EncodingError(message))

    # FIXME explicit null vs remove from structure
    is_null_sentinel = cleaned.lower().strip() in ('n/a', 'na', 'no')
    if not is_null_sentinel:
        handler = getattr(self, key, self.default)
        yield from handler(cleaned)
def normalize(self, rows):
    """Clean tabular rows: drop dead columns, strip cells, skip empty rows.

    Removes columns whose cells are all empty, strips whitespace and BOM
    (U+FEFF) characters from every remaining cell, and skips rows that end
    up entirely empty.  A BOM anywhere in the data is recorded via
    ``addError`` exactly once per call (the original appended the same
    error object once per affected cell, as its own FIXME noted), and the
    error is only constructed when a BOM is actually present.
    """
    # FIXME need to detect changes
    bom_reported = False  # report the encoding error at most once per call
    # transpose, keep columns with at least one truthy cell, transpose back
    live_columns = (col for col in zip(*rows) if any(col))  # TODO check perf here
    for row in zip(*live_columns):
        if not bom_reported and any('\ufeff' in cell for cell in row):
            bom_reported = True
            self.addError(
                exc.EncodingError(f"encoding feff error in '{self.path}'"))
        n_row = [cell.strip().replace('\ufeff', '') for cell in row]
        if any(n_row):  # skip totally empty rows
            yield n_row
def csv(self, delimiter=','):
    """Yield csv rows from ``self.path``, trying utf-8 then latin-1.

    Rows and empty-row error messages are buffered per encoding attempt
    and only emitted after the whole file decodes successfully.  This
    fixes the original behavior where a ``UnicodeDecodeError`` partway
    through the utf-8 pass caused already-yielded rows (and already
    recorded errors) to be duplicated when the file was re-read as
    latin-1.  Falling back to a non-utf-8 encoding records an
    EncodingError.  Empty rows are reported but never yielded.
    """
    import csv as _csv  # local alias: this callable itself is named 'csv'
    for encoding in ('utf-8', 'latin-1'):
        rows = []
        empty_row_messages = []
        try:
            with open(self.path, 'rt', encoding=encoding) as f:
                for row in _csv.reader(f, delimiter=delimiter):
                    if row:
                        rows.append(row)
                    else:
                        # defer reporting until the whole file decodes
                        empty_row_messages.append(
                            f'empty row in {self.path.as_posix()!r}')
        except UnicodeDecodeError:
            continue  # retry with the next encoding

        for message in empty_row_messages:
            self.addError(message)
            logd.error(message)

        if encoding != 'utf-8':
            message = f'encoding bad {encoding!r} {self.path.as_posix()!r}'
            self.addError(exc.EncodingError(message))
            logd.error(message)

        yield from rows
        return
def xlsx(self):
    """Yield rows from the first sheet of the xlsx file at ``self.path``.

    The sheet is converted to tab-separated text with ``Xlsx2csv`` and
    parsed with the stdlib csv module.  When the workbook contains more
    than one sheet an error is recorded, since only sheet 1 is read.
    Fixes a broken log message ('in{path}' was missing a space).
    """
    kwargs = {
        'delimiter': '\t',
        'skip_empty_lines': True,
        'outputencoding': 'utf-8',
    }
    sheetid = 1  # only the first sheet is converted
    xlsx2csv = Xlsx2csv(self.path.as_posix(), **kwargs)

    ns = len(xlsx2csv.workbook.sheets)
    if ns > 1:
        # NOTE(review): EncodingError looks like an odd type for a
        # sheet-count problem -- confirm whether a better type exists
        message = f'too many sheets ({ns}) in {self.path.as_posix()!r}'
        self.addError(exc.EncodingError(message))
        logd.error(message)

    f = io.StringIO()
    try:
        xlsx2csv.convert(f, sheetid)
        f.seek(0)
        yield from csv.reader(f, delimiter='\t')
    except SheetNotFoundException as e:
        # fixed: original read 'Sheet weirdness in{self.path}' (no space)
        log.warning(f'Sheet weirdness in {self.path}')
        log.warning(str(e))