import codecs
import json
import logging

# joint_context, isiterable and open_gz are assumed to be defined elsewhere
# in this package (a pass-through context manager, an iterable check, and a
# transparent gzip/plain-text opener, respectively).


def read_json_lines(finput, logger=logging, encoding='utf-8', show_progress=None):
    """Read a JSON-lines resource, yielding one decoded object per line

    :param finput: path or an iterable of lines
    :type finput: str, file
    :param logger: logger used for progress and parse-error messages
    :param encoding: encoding used to decode byte lines (default: UTF-8)
    :type encoding: str
    :param show_progress: log an INFO message every N lines (None to disable)
    :type show_progress: int, None
    :rtype: generator
    """
    ctx = joint_context(finput) \
        if isiterable(finput) \
        else open_gz(finput, 'r')
    with ctx as fhandle:
        for idx, line in enumerate(fhandle, start=1):
            if show_progress and idx % show_progress == 0 and idx > 1:
                logger.info("Processed %d lines", idx)
            if isinstance(line, bytes):
                # decode here instead of passing encoding to json.loads,
                # whose encoding keyword is not available on Python 3
                line = line.decode(encoding)
            try:
                obj = json.loads(line)
            except ValueError as err:
                logger.error("Could not parse line %d: %s", idx, err)
                continue
            yield obj
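# Example usage (a minimal sketch): stream records from a gzipped JSON-lines
# file, logging progress every 10,000 lines. The file name and the `process`
# callable are hypothetical, not part of this module.
#
#     for record in read_json_lines("events.jsonl.gz", show_progress=10000):
#         process(record)
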
def read_text_resource(finput, encoding='utf-8', ignore_prefix='#'):
    """Read a text resource ignoring comments beginning with pound sign

    :param finput: path or file handle
    :type finput: str, file
    :param encoding: which encoding to use (default: UTF-8)
    :type encoding: str
    :param ignore_prefix: comment marker; text from this prefix to the end of
        the line is dropped, and lines left empty are skipped
    :type ignore_prefix: str, unicode
    :rtype: generator
    """
    ctx = joint_context(codecs.iterdecode(finput, encoding=encoding)) \
        if isiterable(finput) \
        else codecs.open(finput, 'r', encoding=encoding)
    with ctx as fhandle:
        for line in fhandle:
            if ignore_prefix is not None:
                line = line.split(ignore_prefix)[0]
            line = line.strip()
            if line:
                yield line
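# Example usage (a minimal sketch): load a newline-delimited word list while
# dropping '#' comments and blank lines. The path is hypothetical.
#
#     stopwords = set(read_text_resource("stopwords.txt"))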