def _simple_parser(lines, colNames=None, header=0, index_col=0, na_values=None, date_parser=None, parse_dates=True): """ Workhorse function for processing nested list into DataFrame Should be replaced by np.genfromtxt eventually? """ if header is not None: columns = [] for i, c in enumerate(lines[header]): if c == '': columns.append('Unnamed: %d' % i) else: columns.append(c) content = lines[header+1:] counts = {} for i, col in enumerate(columns): cur_count = counts.get(col, 0) if cur_count > 0: columns[i] = '%s.%d' % (col, cur_count) counts[col] = cur_count + 1 else: ncols = len(lines[0]) if not colNames: columns = ['X.%d' % (i + 1) for i in range(ncols)] else: assert(len(colNames) == ncols) columns = colNames content = lines if len(content) == 0: # pragma: no cover if index_col is not None: if np.isscalar(index_col): index = Index([], name=columns.pop(index_col)) else: cp_cols = list(columns) names = [] for i in index_col: name = cp_cols[i] columns.remove(name) names.append(name) index = MultiIndex.fromarrays([[]] * len(index_col), names=names) else: index = Index([]) return DataFrame(index=index, columns=columns) # common NA values # no longer excluding inf representations # '1.#INF','-1.#INF', '1.#INF000000', NA_VALUES = set(['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', 'NA', '#NA', 'NULL', 'NaN', 'nan', '']) if na_values is None: na_values = NA_VALUES else: na_values = set(list(na_values)) | NA_VALUES zipped_content = list(lib.to_object_array(content).T) if index_col is None and len(content[0]) == len(columns) + 1: index_col = 0 # no index column specified, so infer that's what is wanted if index_col is not None: if np.isscalar(index_col): index = zipped_content.pop(index_col) if len(content[0]) == len(columns) + 1: name = None else: name = columns.pop(index_col) else: # given a list of index idx_names = [] index = [] for idx in index_col: idx_names.append(columns[idx]) index.append(zipped_content[idx]) #remove index items from content and columns, don't pop in loop for i in range(len(index_col)): columns.remove(idx_names[i]) zipped_content.remove(index[i]) if np.isscalar(index_col): if parse_dates: index = lib.try_parse_dates(index, parser=date_parser) index = Index(_convert_types(index, na_values), name=name) else: arrays = _maybe_convert_int_mindex(index, parse_dates, date_parser) index = MultiIndex.from_arrays(arrays, names=idx_names) else: index = Index(np.arange(len(content))) if not index._verify_integrity(): dups = index._get_duplicates() raise Exception('Index has duplicates: %s' % str(dups)) if len(columns) != len(zipped_content): raise Exception('wrong number of columns') data = dict((k, v) for k, v in zip(columns, zipped_content)) data = _convert_to_ndarrays(data, na_values) return DataFrame(data=data, columns=columns, index=index)