def process_line(self, line):
    r"""Reduce one LaTeX table row to its cell content.

    Everything after the first ``%`` is dropped as a comment, surrounding
    whitespace is removed and the trailing ``\\`` row terminator is cut off.

    :param line: one raw line of the LaTeX table body
    :returns: the line without comment, outer whitespace and trailing ``\\``
    :raises core.InconsistentTableError: if the line does not end with ``\\``
    """
    line = line.split('%')[0].strip()
    if not line.endswith(r'\\'):
        raise core.InconsistentTableError(r'Lines in LaTeX table have to end with \\')
    # Trim backslashes on the right only: a two-sided strip would also eat
    # the leading backslash of a first cell such as ``\alpha``.
    return line.rstrip('\\')
def get_cols(self, lines):
    """Build the header Column objects for an in-memory table.

    Column names are taken from ``self.names`` when already set, otherwise
    from the field names of a NumPy structured array, otherwise they are
    generated with ``self.auto_format`` from the width of the first row.
    ``self._set_cols_from_names()`` is then called and a type is assigned
    to every column in ``self.cols``.

    :param lines: table data: NumPy recarray, DictLikeNumpy object or list of lists
    :raises core.InconsistentTableError: if names have to be auto-generated
        but ``lines`` contains no rows
    """
    if self.names is None:
        try:
            # A NumPy structured array carries its own field names
            self.names = lines.dtype.names
        except AttributeError:
            # Not a NumPy array: number the columns after the first row's width
            try:
                first_row = next(iter(lines))
            except StopIteration:
                raise core.InconsistentTableError(
                    'No data lines found so cannot autogenerate column names'
                )
            self.names = [
                self.auto_format % number
                for number in range(1, len(first_row) + 1)
            ]

    self._set_cols_from_names()

    if core.has_numpy and isinstance(lines, numpy.ndarray):
        # NumPy input: derive each column type from the dtype name.  The
        # first substring that matches wins; otherwise col.type is untouched.
        dtype_markers = (
            ('int', core.IntType),
            ('float', core.FloatType),
            ('str', core.StrType),
        )
        for col in self.cols:
            dtype_name = lines[col.name].dtype.name
            for marker, col_type in dtype_markers:
                if marker in dtype_name:
                    col.type = col_type
                    break
        return

    # List of lists or DictLikeNumpy: gather the set of value types seen in
    # each requested column, then reduce each set to a single column type.
    wanted_indexes = [col.index for col in self.cols]
    seen_types = {}
    for row in lines:
        for index in wanted_indexes:
            seen_types.setdefault(index, set()).add(get_val_type(row[index]))

    for col in self.cols:
        col.type = get_lowest_type(seen_types[col.index])
def process_line(self, line):
    r"""Extract the column-name text from a ``\tablehead{...}`` line.

    The trailing ``%`` comment and the ``\tablehead`` keyword are removed,
    the enclosing braces are stripped and every ``\colhead`` keyword is
    deleted, leaving the brace-wrapped column names.

    :param line: the raw ``\tablehead`` line
    :returns: content of the outer braces with ``\colhead`` removed
    :raises core.InconsistentTableError: if what remains is not wrapped in
        ``{`` ... ``}`` (including when nothing remains at all)
    """
    line = line.split('%')[0]
    line = line.replace(r'\tablehead', '').strip()
    # startswith/endswith instead of indexing, so an empty remainder is
    # reported as a malformed header rather than raising IndexError.
    if not (line.startswith('{') and line.endswith('}')):
        raise core.InconsistentTableError(r'\tablehead is missing {}')
    return line[1:-1].replace(r'\colhead', '')
def process_lines(self, lines):
    """Return only the data part of ``lines``, dropping any CDS header.

    When both ``self.header.readme`` and ``self.table_name`` are set the
    lines are returned unchanged.  Otherwise everything up to and including
    the last line starting with ``------`` or ``=======`` is discarded.

    :param lines: list of table lines
    :returns: list of lines following the last section delimiter
    :raises core.InconsistentTableError: if no section delimiter is present
    """
    # Header lives in the ReadMe and the data in its own file: nothing to skip
    if self.header.readme and self.table_name:
        return lines

    delimiters = ('------', '=======')
    last_delimiter = None
    for position, text in enumerate(lines):
        if text.startswith(delimiters):
            last_delimiter = position

    if last_delimiter is None:
        raise core.InconsistentTableError('No CDS section delimiter found')
    return lines[last_delimiter + 1:]
def read(self, table):
    """Read the in-memory ``table`` and return the outputter's result.

    Links header and data objects, obtains the lines from the inputter,
    lets the header build its columns and then copies each row's values
    into the ``str_vals`` list of the requested columns.  The outputter is
    run with pass-through converters, i.e. values are handed over unchanged.

    :param table: input table, passed to ``self.inputter.get_lines``
    :returns: the table produced by ``self.outputter(cols)``
    :raises core.InconsistentTableError: if a data row does not have as many
        values as there are header names
    """
    self.data.header = self.header
    self.header.data = self.data

    self.lines = self.inputter.get_lines(table, self.header.names)
    self.data.get_data_lines(self.lines)
    self.header.get_cols(self.lines)
    cols = self.header.cols  # header.cols corresponds to *output* columns requested
    # header.names corresponds to *all* header columns in table
    n_data_cols = len(self.header.names)
    self.data.splitter.cols = cols

    for i, str_vals in enumerate(self.data.get_str_vals()):
        # Count once via list() and reuse the count in the message, so the
        # message cannot fail on a row type without len().
        n_vals = len(list(str_vals))
        if n_vals != n_data_cols:
            # Report the count that was actually compared (all header
            # columns), not the number of requested output columns.
            errmsg = (
                'Number of header columns (%d) inconsistent with '
                'data columns (%d) at data line %d\n'
                'Header values: %s\n'
                'Data values: %s'
                % (n_data_cols, n_vals, i, [x.name for x in cols], str_vals))
            raise core.InconsistentTableError(errmsg)

        for col in cols:
            col.str_vals.append(str_vals[col.index])

    self.data.masks(cols)
    self.cols = cols
    if hasattr(table, 'keywords'):
        self.keywords = table.keywords

    # Pass-through converters: each returns the column values untouched
    self.outputter.default_converters = [
        ((lambda vals: vals), core.IntType),
        ((lambda vals: vals), core.FloatType),
        ((lambda vals: vals), core.StrType),
    ]
    self.table = self.outputter(cols)
    self.cols = self.header.cols

    return self.table
def get_cols(self, lines):
    """Collect DAOphot column names and create the header columns.

    The leading run of lines starting with ``#`` is scanned; in each of
    them the text between ``#N`` and the next ``#`` is split on whitespace
    and appended to ``self.names``.  Scanning stops at the first line that
    does not start with ``#``.  Finally ``self._set_cols_from_names()`` is
    called.

    :param lines: list of table lines
    :raises core.InconsistentTableError: if no column name was found
    """
    name_def = re.compile(r'#N([^#]+)#')
    self.names = []

    for text in lines:
        if not text.startswith('#'):
            # First non-comment line marks the end of the header
            break
        found = name_def.search(text)
        if not found:
            continue
        self.names.extend(found.group(1).split())

    if not self.names:
        raise core.InconsistentTableError(
            'No column names found in DAOphot header')

    self._set_cols_from_names()
def _guess(table, read_kwargs):
    """Read ``table`` by trying successive sets of reader keyword args.

    The user's own ``read_kwargs`` are tried first, followed by every entry
    of ``_get_guess_kwargs_list()``.  User-supplied key/val pairs are merged
    into each guess; a guess that defines the same key with a different
    value is skipped.  A guess is accepted only if it yields more than one
    column and no column name is numeric, empty, or starts/ends with a
    delimiter-like character.  If every guess fails, ``read_kwargs`` is
    tried once more without those column requirements.

    :param table: table input, passed to ``reader.read``
    :param read_kwargs: dict of keyword args given to the ``read()`` call
    :returns: the table returned by the first successful reader
    :raises core.InconsistentTableError: if no attempt succeeds; the message
        lists every set of keyword args that was tried
    """
    # Characters a plausible guessed column name may not start or end with
    bads = [" ", ",", "|", "\t", "'", '"']

    def _implausible(name):
        # True when ``name`` hints that the table was split the wrong way
        return (_is_number(name) or len(name) == 0
                or name[0] in bads or name[-1] in bads)

    # Keep a trace of all failed guesses kwarg
    failed_kwargs = []

    for guess_kwargs in [read_kwargs.copy()] + _get_guess_kwargs_list():
        # Fold the user's kwargs into the guess.  A key present in both
        # with different values means the guess contradicts the user
        # (e.g. delimiter="|" versus delimiter=" "), so drop the guess.
        conflict = False
        for key, val in read_kwargs.items():
            if key in guess_kwargs:
                if val != guess_kwargs[key]:
                    conflict = True
                    break
            else:
                guess_kwargs[key] = val
        if conflict:
            continue

        try:
            reader = get_reader(**guess_kwargs)
            dat = reader.read(table)

            # When guessing impose additional requirements on column names
            # and number of cols
            if len(reader.cols) <= 1:
                raise ValueError
            for col in reader.cols:
                if _implausible(col.name):
                    raise ValueError

            return dat
        except (core.InconsistentTableError, ValueError, TypeError):
            failed_kwargs.append(guess_kwargs)

    # Failed all guesses, try the original read_kwargs without column
    # requirements
    try:
        reader = get_reader(**read_kwargs)
        return reader.read(table)
    except (core.InconsistentTableError, ValueError):
        failed_kwargs.append(read_kwargs)

        report = ['\nERROR: Unable to guess table for with the guesses listed below:']
        for kwargs in failed_kwargs:
            shown_keys = sorted(
                key for key in kwargs if key not in ('Reader', 'Outputter'))
            reader_repr = repr(kwargs.get('Reader', basic.Basic))
            # Bare class name pulled out of "<class 'pkg.module.Name'>"
            parts = ['Reader:' + re.search(r"\.(\w+)'>", reader_repr).group(1)]
            for key in shown_keys:
                parts.append('%s: %s' % (key, repr(kwargs[key])))
            report.append(' '.join(parts))
        report.append('ERROR: Unable to guess table for with the guesses listed above.')
        report.append('Check the table and try with guess=False and appropriate arguments to read()')
        raise core.InconsistentTableError('\n'.join(report))
def get_cols(self, lines):
    """Initialize the header Column objects from the table ``lines`` for a
    CDS header.

    When both ``self.readme`` and ``self.data.table_name`` are set, the
    header block is read from the ReadMe file instead of ``lines``: the
    "Byte-by-byte Description of file:" line whose file name patterns match
    the table name (``fnmatch`` wildcards allowed) starts the block, and
    the third following delimiter line ends it.

    Sets ``self.names`` to all parsed column names, ``self.cols`` to the
    columns left after ``include_names`` / ``exclude_names`` filtering
    (re-indexed from 0) and ``self.n_data_cols`` to their number.  For a
    column whose description starts with ``?`` a ``(null, fill, name)``
    entry is appended to ``self.data.fill_values``.

    :param lines: list of table lines
    :raises core.InconsistentTableError: if the ReadMe has no complete
        header block for the table, or if there are no header lines at all
    :raises ValueError: if a line in the column definitions is neither a
        column definition nor a continuation of a previous description
    """
    section_delims = ('------', '=======')

    # Read header block for the table ``self.data.table_name`` from the read
    # me file ``self.readme``.
    if self.readme and self.data.table_name:
        in_header = False
        # Header info is not in data lines but in a separate file.
        lines = []
        comment_lines = 0
        # ``with`` closes the ReadMe even when the error below is raised;
        # a bare open()/close() pair leaked the handle in that case.
        with open(self.readme, "r") as f:
            for line in f:
                line = line.strip()
                if in_header:
                    lines.append(line)
                    if line.startswith(section_delims):
                        comment_lines += 1
                        if comment_lines == 3:
                            break
                else:
                    match = re.match(
                        r'Byte-by-byte Description of file: (?P<name>.+)$',
                        line, re.IGNORECASE)
                    if match:
                        # Split 'name' in case it contains multiple files
                        names = [
                            s for s in re.split('[, ]+', match.group('name'))
                            if s
                        ]
                        # Does any pattern (wildcards included) match the
                        # table name?
                        if any(fnmatch.fnmatch(self.data.table_name, pattern)
                               for pattern in names):
                            in_header = True
                            lines.append(line)
            else:
                # File exhausted without seeing a complete header block
                raise core.InconsistentTableError(
                    "Can't find table {0} in {1}".format(
                        self.data.table_name, self.readme))

    if not lines:
        # Without this guard ``i_col_def`` below would stay unbound
        raise core.InconsistentTableError('No CDS header lines found')

    # Locate the description line; if none matches, ``i_col_def`` is left
    # at the last index and the loop below finds no column definitions.
    for i_col_def, line in enumerate(lines):
        if re.match(r'Byte-by-byte Description', line, re.IGNORECASE):
            break

    re_col_def = re.compile(
        r"""\s*
            (?P<start> \d+ \s* -)? \s*
            (?P<end>   \d+)        \s+
            (?P<format> [\w.]+)    \s+
            (?P<units> \S+)        \s+
            (?P<name>  \S+)        \s+
            (?P<descr> \S.+)""",
        re.VERBOSE)

    cols = []
    # Column definitions start 4 lines after the description line
    # (presumably past the delimiter / column-title / delimiter lines --
    # TODO confirm against the CDS ReadMe layout).
    for i, line in enumerate(itertools.islice(lines, i_col_def + 4, None)):
        if line.startswith(section_delims):
            break

        match = re_col_def.match(line)
        if match:
            col = core.Column(name=match.group('name'), index=i)
            # A single-byte column has no "start-" part: reuse ``end``.
            # ``start`` is stored minus one while ``end`` is kept as given.
            col.start = int(
                re.sub(r'[-\s]', '',
                       match.group('start') or match.group('end'))) - 1
            col.end = int(match.group('end'))
            col.units = match.group('units')
            col.descr = match.group('descr')
            col.raw_type = match.group('format')
            col.type = self.get_col_type(col)

            # A description starting with "?" declares a null value
            null_match = re.match(r'\? (?P<equal> =)? (?P<nullval> \S*)',
                                  col.descr, re.VERBOSE)
            if null_match:
                if issubclass(col.type, core.FloatType):
                    fillval = 'nan'
                else:
                    fillval = '-999'
                nullval = null_match.group('nullval')
                # A lone "-" is recorded as "---"; anything else (the empty
                # string included) is used verbatim.
                col.null = '---' if nullval == '-' else nullval
                self.data.fill_values.append((col.null, fillval, col.name))

            cols.append(col)
        elif cols:
            # could be a continuation of the previous col's description
            cols[-1].descr += line.strip()
        else:
            raise ValueError('Line "%s" not parsable as CDS header' % line)

    self.names = [x.name for x in cols]
    names = set(self.names)
    if self.include_names is not None:
        names.intersection_update(self.include_names)
    if self.exclude_names is not None:
        names.difference_update(self.exclude_names)

    self.cols = [x for x in cols if x.name in names]
    self.n_data_cols = len(self.cols)

    # Re-index the cols because the FixedWidthSplitter does NOT return the
    # ignored cols (as is the case for typical delimiter-based splitters)
    for i, col in enumerate(self.cols):
        col.index = i