def _csv2rec(fname, comments='#', skiprows=0, checkrows=0, delimiter=',',
             converterd=None, names=None, missing='', missingd=None,
             use_mrecords=False, dayfirst=False, yearfirst=False):
    """
    Load data from comma/space/tab delimited file in *fname* into a
    numpy record array and return the record array.

    If *names* is *None*, a header row is required to automatically
    assign the recarray names.  The headers will be lower cased,
    spaces will be converted to underscores, and illegal attribute
    name characters removed.  If *names* is not *None*, it is a
    sequence of names to use for the column names.  In this case, it
    is assumed there is no header row.

    - *fname*: can be a filename or a file handle.  Support for gzipped
      files is automatic, if the filename ends in '.gz'.  The handle is
      always closed before this function returns or raises.

    - *comments*: the character used to indicate the start of a comment
      in the file, or *None* to switch off the removal of comments

    - *skiprows*: is the number of rows from the top to skip

    - *checkrows*: is the number of rows to check to validate the column
      data type.  When set to zero all rows are validated.

    - *delimiter*: the field separator.  When it is a single space, runs
      of whitespace are collapsed so they act as one separator.

    - *converterd*: if not *None*, is a dictionary mapping column number
      or munged column name to a converter function.

    - *names*: if not None, is a list of header names (or a single
      comma-separated string of names).  In this case, no header will
      be read from the file

    - *missingd* is a dictionary mapping munged column names to field
      values which signify that the field does not contain actual data
      and should be masked, e.g., '0000-00-00' or 'unused'

    - *missing*: a string whose value signals a missing field regardless
      of the column it appears in

    - *use_mrecords*: if True, return an mrecords.fromrecords record
      array if any of the data are missing

    - *dayfirst*: default is False so that MM-DD-YY has precedence over
      DD-MM-YY.  See
      http://labix.org/python-dateutil#head-b95ce2094d189a89f80f5ae52a05b4ab7b41af47
      for further information.

    - *yearfirst*: default is False so that MM-DD-YY has precedence over
      YY-MM-DD.  See
      http://labix.org/python-dateutil#head-b95ce2094d189a89f80f5ae52a05b4ab7b41af47
      for further information.

    Columns without a custom converter get the first of bool, int,
    float, date, datetime, str that accepts every inspected value.
    Missing fields are replaced by None, -1, nan, date(1, 1, 1),
    datetime(1, 1, 1) and '' respectively.

    If no rows are found, *None* is returned.

    Raises ValueError if the file contains no header row (when one is
    required) or no data row at all.
    """
    import dateutil.parser
    import datetime

    if converterd is None:
        converterd = dict()
    if missingd is None:
        missingd = {}

    delimiter = str(delimiter)

    class FH:
        """
        For space-delimited files, we want different behavior than
        comma or tab.  Generally, we want multiple spaces to be
        treated as a single separator, whereas with comma and tab we
        want multiple commas to return multiple (empty) fields.  The
        join/strip trick below effects this.
        """
        def __init__(self, fh):
            self.fh = fh

        def close(self):
            self.fh.close()

        def seek(self, arg):
            self.fh.seek(arg)

        def fix(self, s):
            return ' '.join(s.split())

        def __next__(self):
            return self.fix(next(self.fh))

        def __iter__(self):
            for line in self.fh:
                yield self.fix(line)

    def process_skiprows(reader):
        # Consume the first *skiprows* rows of *reader*.
        if skiprows:
            for i, _ in enumerate(reader):
                if i >= (skiprows - 1):
                    break

    def iscomment(row):
        "Is *row* a comment line (first field starts with *comments*)?"
        return (len(row) > 0 and comments is not None
                and row[0].startswith(comments))

    def ismissing(name, val):
        "Should the value val in column name be masked?"
        return val == missing or val == missingd.get(name) or val == ''

    def with_default_value(func, default):
        # Wrap the one-argument converter *func* into a (name, val)
        # callable that yields *default* for missing fields.
        def newfunc(name, val):
            if ismissing(name, val):
                return default
            else:
                return func(val)
        return newfunc

    def mybool(x):
        if x == 'True':
            return True
        elif x == 'False':
            return False
        else:
            raise ValueError('invalid bool')

    dateparser = dateutil.parser.parse

    def mydateparser(x):
        # try and return a datetime object
        return dateparser(x, dayfirst=dayfirst, yearfirst=yearfirst)

    def mydate(x):
        # try and return a date object; reject values with a time part
        d = dateparser(x, dayfirst=dayfirst, yearfirst=yearfirst)
        if d.hour > 0 or d.minute > 0 or d.second > 0:
            raise ValueError('not a date')
        return d.date()

    mydateparser = with_default_value(mydateparser, datetime.datetime(1, 1, 1))
    myfloat = with_default_value(float, np.nan)
    myint = with_default_value(int, -1)
    mystr = with_default_value(str, '')
    mybool = with_default_value(mybool, None)
    mydate = with_default_value(mydate, datetime.date(1, 1, 1))

    def get_func(name, item, func):
        # promote functions in this order, starting from the converter
        # currently assigned to the column
        funcs = [mybool, myint, myfloat, mydate, mydateparser, mystr]
        for candidate in funcs[funcs.index(func):]:
            try:
                candidate(name, item)
            except Exception:
                # Deliberately broad: any failure just means "try the
                # next, more permissive converter".
                continue
            return candidate
        raise ValueError('Could not find a working conversion function')

    # map column names that clash with builtins -- TODO - extend this list
    itemd = {
        'return': 'return_',
        'file': 'file_',
        'print': 'print_',
    }

    def get_converters(reader):
        # Inspect the data rows and return one converter per column, or
        # None when the reader yields no non-comment row at all.
        converters = None
        i = 0
        for row in reader:
            if iscomment(row):
                continue
            if converters is None:
                # Size by the column names rather than by the first row,
                # so a blank or short first row cannot cause an
                # IndexError when a later row has more fields.
                converters = [mybool] * len(names)
            # NOTE: because the test is ``i > checkrows``, checkrows + 1
            # rows are inspected; kept as-is for backward compatibility.
            if checkrows and i > checkrows:
                break
            i += 1

            for j, (name, item) in enumerate(zip(names, row)):
                func = converterd.get(j)
                if func is None:
                    func = converterd.get(name)
                if func is None:
                    func = converters[j]
                    if len(item.strip()):
                        func = get_func(name, item, func)
                else:
                    # how should we handle custom converters and defaults?
                    func = with_default_value(func, None)
                converters[j] = func
        return converters

    fh = cbook.to_filehandle(fname)
    if delimiter == ' ':
        fh = FH(fh)

    # Everything that reads from fh lives in the try block so the handle
    # is released even when parsing fails.
    try:
        reader = csv.reader(fh, delimiter=delimiter)
        process_skiprows(reader)

        # Get header and remove invalid characters
        needheader = names is None

        if needheader:
            headers = None
            for row in reader:
                if iscomment(row):
                    continue
                headers = row
                break
            if headers is None:
                # Empty (or comment-only) input: no header row to read.
                raise ValueError('Could not find any valid data in CSV file')

            # remove these chars
            delete = set(r"""~!@#$%^&*()-=+~\|}[]{';: /?.>,<""")
            delete.add('"')

            names = []
            seen = dict()
            for i, item in enumerate(headers):
                item = item.strip().lower().replace(' ', '_')
                item = ''.join([c for c in item if c not in delete])
                if not len(item):
                    item = 'column%d' % i

                item = itemd.get(item, item)
                # Disambiguate repeated headers: name, name_1, name_2, ...
                cnt = seen.get(item, 0)
                if cnt > 0:
                    names.append(item + '_%d' % cnt)
                else:
                    names.append(item)
                seen[item] = cnt + 1
        else:
            if isinstance(names, str):
                names = [n.strip() for n in names.split(',')]

        # get the converter functions by inspecting checkrows
        converters = get_converters(reader)
        if converters is None:
            raise ValueError('Could not find any valid data in CSV file')

        # reset the reader and start over
        fh.seek(0)
        reader = csv.reader(fh, delimiter=delimiter)
        process_skiprows(reader)

        if needheader:
            while True:
                # skip past any comments and consume one line of column
                # header
                row = next(reader)
                if iscomment(row):
                    continue
                break

        # iterate over the remaining rows and convert the data to date
        # objects, ints, or floats as appropriate
        rows = []
        rowmasks = []
        for row in reader:
            if not len(row) or iscomment(row):
                continue
            # Ensure that the row returned always has the same nr of
            # elements
            row.extend([''] * (len(converters) - len(row)))
            rows.append([func(name, val)
                         for func, name, val in zip(converters, names, row)])
            rowmasks.append([ismissing(name, val)
                             for name, val in zip(names, row)])
    finally:
        fh.close()

    if not len(rows):
        return None

    if use_mrecords and np.any(rowmasks):
        r = np.ma.mrecords.fromrecords(rows, names=names, mask=rowmasks)
    else:
        r = np.rec.fromrecords(rows, names=names)
    return r
def _csv2rec(fname, comments='#', skiprows=0, checkrows=0, delimiter=',',
             converterd=None, names=None, missing='', missingd=None,
             use_mrecords=False, dayfirst=False, yearfirst=False):
    """
    Load data from comma/space/tab delimited file in *fname* into a
    numpy record array and return the record array.

    If *names* is *None*, a header row is required to automatically
    assign the recarray names.  The headers will be lower cased,
    spaces will be converted to underscores, and illegal attribute
    name characters removed.  If *names* is not *None*, it is a
    sequence of names to use for the column names.  In this case, it
    is assumed there is no header row.

    - *fname*: can be a filename or a file handle.  Support for gzipped
      files is automatic, if the filename ends in '.gz'.  The handle is
      always closed before this function returns or raises.

    - *comments*: the character used to indicate the start of a comment
      in the file, or *None* to switch off the removal of comments

    - *skiprows*: is the number of rows from the top to skip

    - *checkrows*: is the number of rows to check to validate the column
      data type.  When set to zero all rows are validated.

    - *delimiter*: the field separator.  When it is a single space, runs
      of whitespace are collapsed so they act as one separator.

    - *converterd*: if not *None*, is a dictionary mapping column number
      or munged column name to a converter function.

    - *names*: if not None, is a list of header names (or a single
      comma-separated string of names).  In this case, no header will
      be read from the file

    - *missingd* is a dictionary mapping munged column names to field
      values which signify that the field does not contain actual data
      and should be masked, e.g., '0000-00-00' or 'unused'

    - *missing*: a string whose value signals a missing field regardless
      of the column it appears in

    - *use_mrecords*: if True, return an mrecords.fromrecords record
      array if any of the data are missing

    - *dayfirst*: default is False so that MM-DD-YY has precedence over
      DD-MM-YY.  See
      http://labix.org/python-dateutil#head-b95ce2094d189a89f80f5ae52a05b4ab7b41af47
      for further information.

    - *yearfirst*: default is False so that MM-DD-YY has precedence over
      YY-MM-DD.  See
      http://labix.org/python-dateutil#head-b95ce2094d189a89f80f5ae52a05b4ab7b41af47
      for further information.

    Columns without a custom converter get the first of bool, int,
    float, date, datetime, str that accepts every inspected value.
    Missing fields are replaced by None, -1, nan, date(1, 1, 1),
    datetime(1, 1, 1) and '' respectively.

    If no rows are found, *None* is returned.

    Raises ValueError if the file contains no header row (when one is
    required) or no data row at all.
    """
    import dateutil.parser
    import datetime

    if converterd is None:
        converterd = dict()
    if missingd is None:
        missingd = {}

    delimiter = str(delimiter)

    class FH:
        """
        For space-delimited files, we want different behavior than
        comma or tab.  Generally, we want multiple spaces to be
        treated as a single separator, whereas with comma and tab we
        want multiple commas to return multiple (empty) fields.  The
        join/strip trick below effects this.
        """
        def __init__(self, fh):
            self.fh = fh

        def close(self):
            self.fh.close()

        def seek(self, arg):
            self.fh.seek(arg)

        def fix(self, s):
            return ' '.join(s.split())

        def __next__(self):
            return self.fix(next(self.fh))

        def __iter__(self):
            for line in self.fh:
                yield self.fix(line)

    def process_skiprows(reader):
        # Consume the first *skiprows* rows of *reader*.
        if skiprows:
            for i, _ in enumerate(reader):
                if i >= (skiprows - 1):
                    break

    def iscomment(row):
        "Is *row* a comment line (first field starts with *comments*)?"
        return (len(row) > 0 and comments is not None
                and row[0].startswith(comments))

    def ismissing(name, val):
        "Should the value val in column name be masked?"
        return val == missing or val == missingd.get(name) or val == ''

    def with_default_value(func, default):
        # Wrap the one-argument converter *func* into a (name, val)
        # callable that yields *default* for missing fields.
        def newfunc(name, val):
            if ismissing(name, val):
                return default
            else:
                return func(val)
        return newfunc

    def mybool(x):
        if x == 'True':
            return True
        elif x == 'False':
            return False
        else:
            raise ValueError('invalid bool')

    dateparser = dateutil.parser.parse

    def mydateparser(x):
        # try and return a datetime object
        return dateparser(x, dayfirst=dayfirst, yearfirst=yearfirst)

    def mydate(x):
        # try and return a date object; reject values with a time part
        d = dateparser(x, dayfirst=dayfirst, yearfirst=yearfirst)
        if d.hour > 0 or d.minute > 0 or d.second > 0:
            raise ValueError('not a date')
        return d.date()

    mydateparser = with_default_value(mydateparser, datetime.datetime(1, 1, 1))
    myfloat = with_default_value(float, np.nan)
    myint = with_default_value(int, -1)
    mystr = with_default_value(str, '')
    mybool = with_default_value(mybool, None)
    mydate = with_default_value(mydate, datetime.date(1, 1, 1))

    def get_func(name, item, func):
        # promote functions in this order, starting from the converter
        # currently assigned to the column
        funcs = [mybool, myint, myfloat, mydate, mydateparser, mystr]
        for candidate in funcs[funcs.index(func):]:
            try:
                candidate(name, item)
            except Exception:
                # Deliberately broad: any failure just means "try the
                # next, more permissive converter".
                continue
            return candidate
        raise ValueError('Could not find a working conversion function')

    # map column names that clash with builtins -- TODO - extend this list
    itemd = {
        'return': 'return_',
        'file': 'file_',
        'print': 'print_',
    }

    def get_converters(reader):
        # Inspect the data rows and return one converter per column, or
        # None when the reader yields no non-comment row at all.
        converters = None
        i = 0
        for row in reader:
            if iscomment(row):
                continue
            if converters is None:
                # Size by the column names rather than by the first row,
                # so a blank or short first row cannot cause an
                # IndexError when a later row has more fields.
                converters = [mybool] * len(names)
            # NOTE: because the test is ``i > checkrows``, checkrows + 1
            # rows are inspected; kept as-is for backward compatibility.
            if checkrows and i > checkrows:
                break
            i += 1

            for j, (name, item) in enumerate(zip(names, row)):
                func = converterd.get(j)
                if func is None:
                    func = converterd.get(name)
                if func is None:
                    func = converters[j]
                    if len(item.strip()):
                        func = get_func(name, item, func)
                else:
                    # how should we handle custom converters and defaults?
                    func = with_default_value(func, None)
                converters[j] = func
        return converters

    fh = cbook.to_filehandle(fname)
    if delimiter == ' ':
        fh = FH(fh)

    # Everything that reads from fh lives in the try block so the handle
    # is released even when parsing fails.
    try:
        reader = csv.reader(fh, delimiter=delimiter)
        process_skiprows(reader)

        # Get header and remove invalid characters
        needheader = names is None

        if needheader:
            headers = None
            for row in reader:
                if iscomment(row):
                    continue
                headers = row
                break
            if headers is None:
                # Empty (or comment-only) input: no header row to read.
                raise ValueError('Could not find any valid data in CSV file')

            # remove these chars
            delete = set(r"""~!@#$%^&*()-=+~\|}[]{';: /?.>,<""")
            delete.add('"')

            names = []
            seen = dict()
            for i, item in enumerate(headers):
                item = item.strip().lower().replace(' ', '_')
                item = ''.join([c for c in item if c not in delete])
                if not len(item):
                    item = 'column%d' % i

                item = itemd.get(item, item)
                # Disambiguate repeated headers: name, name_1, name_2, ...
                cnt = seen.get(item, 0)
                if cnt > 0:
                    names.append(item + '_%d' % cnt)
                else:
                    names.append(item)
                seen[item] = cnt + 1
        else:
            if isinstance(names, str):
                names = [n.strip() for n in names.split(',')]

        # get the converter functions by inspecting checkrows
        converters = get_converters(reader)
        if converters is None:
            raise ValueError('Could not find any valid data in CSV file')

        # reset the reader and start over
        fh.seek(0)
        reader = csv.reader(fh, delimiter=delimiter)
        process_skiprows(reader)

        if needheader:
            while True:
                # skip past any comments and consume one line of column
                # header
                row = next(reader)
                if iscomment(row):
                    continue
                break

        # iterate over the remaining rows and convert the data to date
        # objects, ints, or floats as appropriate
        rows = []
        rowmasks = []
        for row in reader:
            if not len(row) or iscomment(row):
                continue
            # Ensure that the row returned always has the same nr of
            # elements
            row.extend([''] * (len(converters) - len(row)))
            rows.append([func(name, val)
                         for func, name, val in zip(converters, names, row)])
            rowmasks.append([ismissing(name, val)
                             for name, val in zip(names, row)])
    finally:
        fh.close()

    if not len(rows):
        return None

    if use_mrecords and np.any(rowmasks):
        r = np.ma.mrecords.fromrecords(rows, names=names, mask=rowmasks)
    else:
        r = np.rec.fromrecords(rows, names=names)
    return r