Exemplo n.º 1
0
 def process_line(self, line):
     r"""Reduce one LaTeX table row to its cell content.

     Everything from the first ``%`` onwards is discarded, surrounding
     whitespace is removed, and the trailing ``\\`` row terminator is
     dropped.  Only the *end* of the line is trimmed, so a row whose first
     cell starts with a LaTeX command (e.g. ``\alpha``) keeps its leading
     backslash.

     :param line: raw text of a single table row
     :returns: the row text without comment and without the trailing ``\\``
     :raises core.InconsistentTableError: if the row does not end with ``\\``
     """
     # NOTE(review): an escaped percent sign (``\%``) is also treated as the
     # start of a comment here -- confirm whether that matters for callers.
     line = line.split('%')[0]
     line = line.strip()
     if not line.endswith(r'\\'):
         raise core.InconsistentTableError(r'Lines in LaTeX table have to end with \\')
     # rstrip (not strip): backslashes at the start of the row belong to the
     # cell content and must survive.
     return line.rstrip('\\')
Exemplo n.º 2
0
    def get_cols(self, lines):
        """Set up the header columns and their types from in-memory ``lines``.

        When ``self.names`` is ``None`` the names are taken from
        ``lines.dtype.names`` if that attribute exists; otherwise they are
        generated with ``self.auto_format`` from the length of the first row.
        ``self._set_cols_from_names()`` is then called (presumably this is
        what populates ``self.cols`` -- confirm against the base class), and
        finally a type is assigned to every column in ``self.cols``.

        Nothing is returned; results are stored on ``self``.

        :param lines: table data (NumPy structured array, or an iterable of
            rows indexable by column position)
        :raises core.InconsistentTableError: if names must be auto-generated
            but ``lines`` yields no rows
        """
        if self.names is None:
            try:
                # A structured array carries its own field names.
                self.names = lines.dtype.names
            except AttributeError:
                # No dtype: number the columns after the first row's width.
                try:
                    first_row = next(iter(lines))
                except StopIteration:
                    raise core.InconsistentTableError(
                        'No data lines found so cannot autogenerate column names'
                    )
                width = len(first_row)
                self.names = [self.auto_format % num for num in range(1, width + 1)]

        self._set_cols_from_names()

        if core.has_numpy and isinstance(lines, numpy.ndarray):
            # Map the dtype name of each field onto a column type; the first
            # matching keyword wins and unmatched dtypes leave col.type alone.
            keyword_types = (
                ('int', core.IntType),
                ('float', core.FloatType),
                ('str', core.StrType),
            )
            for col in self.cols:
                dtype_name = lines[col.name].dtype.name
                for keyword, col_type in keyword_types:
                    if keyword in dtype_name:
                        col.type = col_type
                        break
        else:
            # Rows are plain sequences (or DictLikeNumpy): collect the set of
            # value types seen per requested column, then reduce each set.
            types_seen = {}
            wanted = [col.index for col in self.cols]
            for row in lines:
                for idx in wanted:
                    cell_type = get_val_type(row[idx])
                    types_seen.setdefault(idx, set()).add(cell_type)
            for col in self.cols:
                col.type = get_lowest_type(types_seen[col.index])
Exemplo n.º 3
0
 def process_line(self, line):
     r"""Extract the column-name text from a ``\tablehead{...}`` line.

     The ``%`` comment is dropped, the ``\tablehead`` keyword is removed and
     the remaining text must be wrapped in one pair of braces, which is
     stripped.  Every ``\colhead`` keyword is then removed from the result.

     :param line: raw header line
     :returns: contents of the ``\tablehead`` braces without ``\colhead``
     :raises core.InconsistentTableError: if the text left after removing
         ``\tablehead`` is not enclosed in ``{`` ... ``}`` (this includes an
         empty or comment-only line)
     """
     line = line.split('%')[0]
     line = line.replace(r'\tablehead', '')
     line = line.strip()
     # startswith/endswith are safe on an empty string, whereas indexing
     # line[0] would raise IndexError instead of the intended table error.
     if not (line.startswith('{') and line.endswith('}')):
         raise core.InconsistentTableError(r'\tablehead is missing {}')
     return line[1:-1].replace(r'\colhead', '')
Exemplo n.º 4
0
 def process_lines(self, lines):
     """Return the data lines that follow the last CDS section delimiter.

     When both ``self.header.readme`` and ``self.table_name`` are set,
     ``lines`` is returned untouched (per the original comment, the header
     then lives in the ReadMe file rather than in the data lines).
     Otherwise every line is scanned and everything after the last line
     starting with ``------`` or ``=======`` is returned.

     :param lines: list of table lines
     :returns: list of lines after the final delimiter (or ``lines`` itself)
     :raises core.InconsistentTableError: if no delimiter line exists
     """
     if self.header.readme and self.table_name:
         return lines

     delimiters = ('------', '=======')
     last_delimiter = None
     for position, text in enumerate(lines):
         if text.startswith(delimiters):
             last_delimiter = position

     if last_delimiter is None:
         raise core.InconsistentTableError('No CDS section delimiter found')
     return lines[last_delimiter + 1:]
Exemplo n.º 5
0
    def read(self, table):
        """Read an in-memory ``table`` and return the outputter's result.

        The header and data objects are linked to each other, the lines are
        obtained from ``self.inputter``, the header columns are initialised,
        and every row returned by ``self.data.get_str_vals()`` is checked
        against the number of header names before its values are appended to
        the requested columns.  The outputter is given pass-through
        converters, so values are handed on unchanged.

        :param table: input table, passed to ``self.inputter.get_lines``
        :returns: whatever ``self.outputter(cols)`` returns
        :raises core.InconsistentTableError: if a row's length differs from
            the number of header names
        """
        self.data.header = self.header
        self.header.data = self.data

        self.lines = self.inputter.get_lines(table, self.header.names)
        self.data.get_data_lines(self.lines)
        self.header.get_cols(self.lines)
        # header.cols holds the *output* columns requested, header.names
        # holds *all* header columns in the table.
        cols = self.header.cols
        n_data_cols = len(self.header.names)
        self.data.splitter.cols = cols

        for i, str_vals in enumerate(self.data.get_str_vals()):
            # Measure once via list() so the check and the error message use
            # the same count, even for rows that do not support len().
            n_vals = len(list(str_vals))
            if n_vals != n_data_cols:
                # Report n_data_cols (the value actually compared), not the
                # number of requested output columns.
                errmsg = (
                    'Number of header columns (%d) inconsistent with '
                    'data columns (%d) at data line %d\n'
                    'Header values: %s\n'
                    'Data values: %s' %
                    (n_data_cols, n_vals, i, [x.name for x in cols], str_vals))
                raise core.InconsistentTableError(errmsg)

            for col in cols:
                col.str_vals.append(str_vals[col.index])

        self.data.masks(cols)
        self.cols = cols
        if hasattr(table, 'keywords'):
            self.keywords = table.keywords

        # Identity converters: values are passed through as they are.
        self.outputter.default_converters = [
            ((lambda vals: vals), core.IntType),
            ((lambda vals: vals), core.FloatType),
            ((lambda vals: vals), core.StrType)
        ]
        self.table = self.outputter(cols)
        # Re-read header.cols after the outputter ran (kept from the original;
        # NOTE(review): looks redundant unless the outputter rebinds it).
        self.cols = self.header.cols

        return self.table
Exemplo n.º 6
0
    def get_cols(self, lines):
        """Collect DAOphot column names from the leading ``#`` header lines.

        Lines are scanned until the first one that does not start with
        ``#``.  In each header line the text between ``#N`` and the next
        ``#`` is split on whitespace and appended to ``self.names``.
        ``self._set_cols_from_names()`` is called once the names are known.

        Nothing is returned; results are stored on ``self``.

        :param lines: list of table lines
        :raises core.InconsistentTableError: if no column names were found
        """
        name_pattern = re.compile(r'#N([^#]+)#')
        self.names = names = []

        for text in lines:
            if not text.startswith('#'):
                # First non-comment line marks the end of the header.
                break
            found = name_pattern.search(text)
            if found:
                names.extend(found.group(1).split())

        if not names:
            raise core.InconsistentTableError(
                'No column names found in DAOphot header')

        self._set_cols_from_names()
Exemplo n.º 7
0
def _guess(table, read_kwargs):
    """Try to read ``table`` with successive sets of keyword arguments.

    The first attempt uses a copy of ``read_kwargs``; the following attempts
    use the entries of ``_get_guess_kwargs_list()``, each completed with the
    user's ``read_kwargs``.  A guess that conflicts with an explicitly given
    key/value pair is skipped.  A guess is accepted only if it yields more
    than one column and no column name is empty, numeric, or starts/ends
    with a delimiter or quote character.

    If every guess fails, ``read_kwargs`` is tried once more without the
    column-name requirements.

    :param table: table input passed to ``reader.read``
    :param read_kwargs: keyword arguments originally supplied by the caller
    :returns: the result of the first successful ``reader.read(table)``
    :raises core.InconsistentTableError: if nothing works; the message lists
        every set of keyword arguments that was tried
    """
    # Characters that may not start or end a plausible column name.
    bads = [" ", ",", "|", "\t", "'", '"']

    # Keep a trace of all failed guesses kwarg
    failed_kwargs = []

    for guess_kwargs in [read_kwargs.copy()] + _get_guess_kwargs_list():
        guess_kwargs_ok = True  # guess_kwargs are consistent with user_kwargs?
        for key, val in read_kwargs.items():
            # Do guess_kwargs.update(read_kwargs) except that if guess_args has
            # a conflicting key/val pair then skip this guess entirely.
            if key not in guess_kwargs:
                guess_kwargs[key] = val
            elif val != guess_kwargs[key]:
                guess_kwargs_ok = False
                break

        if not guess_kwargs_ok:
            # User-supplied kwarg is inconsistent with the guess-supplied kwarg,
            # e.g. user supplies delimiter="|" but the guess wants to try
            # delimiter=" ", so skip the guess entirely.
            continue

        try:
            reader = get_reader(**guess_kwargs)
            dat = reader.read(table)
            # When guessing impose additional requirements on column names
            # and number of cols.
            if (len(reader.cols) <= 1 or
                    any(_is_number(col.name) or
                        len(col.name) == 0 or
                        col.name[0] in bads or
                        col.name[-1] in bads for col in reader.cols)):
                raise ValueError
            return dat
        except (core.InconsistentTableError, ValueError, TypeError):
            failed_kwargs.append(guess_kwargs)

    # Every guess failed (the loop has no ``break``, so this is always reached
    # when nothing was returned): try the original read_kwargs without the
    # column requirements.
    try:
        reader = get_reader(**read_kwargs)
        return reader.read(table)
    except (core.InconsistentTableError, ValueError):
        failed_kwargs.append(read_kwargs)
        lines = ['\nERROR: Unable to guess table for with the guesses listed below:']
        for kwargs in failed_kwargs:
            reader_cls = kwargs.get('Reader', basic.Basic)
            # Use the class name directly; parsing repr() with a regex crashed
            # with AttributeError whenever the repr had an unexpected shape,
            # hiding the real failure report.
            reader_name = getattr(reader_cls, '__name__', repr(reader_cls))
            keys_vals = ['Reader:' + reader_name]
            keys_vals.extend(
                '%s: %s' % (key, repr(kwargs[key]))
                for key in sorted(kwargs)
                if key not in ('Reader', 'Outputter'))
            lines.append(' '.join(keys_vals))
        lines.append('ERROR: Unable to guess table for with the guesses listed above.')
        lines.append('Check the table and try with guess=False and appropriate arguments to read()')
        raise core.InconsistentTableError('\n'.join(lines))
Exemplo n.º 8
0
    def get_cols(self, lines):
        """Initialize the header Column objects from the table ``lines`` for a CDS
        header.

        If both ``self.readme`` and ``self.data.table_name`` are set, the
        header block is read from the ReadMe file instead of ``lines``: the
        file is scanned for a "Byte-by-byte Description of file:" line whose
        file names (wildcards allowed) match the table name, and lines are
        collected from there up to the third delimiter line.

        The column definitions following the "Byte-by-byte Description" line
        are then parsed into ``core.Column`` objects.  The method sets
        ``self.names`` (all columns), ``self.cols`` (columns left after
        ``include_names`` / ``exclude_names`` filtering, re-indexed from 0)
        and ``self.n_data_cols``, and appends null-value entries to
        ``self.data.fill_values``.  Nothing is returned.

        :param lines: list of table lines
        :raises core.InconsistentTableError: if the table's header block
            cannot be located in the ReadMe, or no "Byte-by-byte Description"
            line exists
        :raises ValueError: if a column-definition line cannot be parsed and
            there is no previous column it could continue
        """
        delimiters = ('------', '=======')

        # Read header block for the table ``self.data.table_name`` from the read
        # me file ``self.readme``.
        if self.readme and self.data.table_name:
            in_header = False
            # Header info is not in data lines but in a separate file.
            lines = []
            comment_lines = 0
            # ``with`` guarantees the file is closed even when the error
            # below is raised.
            with open(self.readme, "r") as f:
                for line in f:
                    line = line.strip()
                    if in_header:
                        lines.append(line)
                        if line.startswith(delimiters):
                            comment_lines += 1
                            if comment_lines == 3:
                                break
                    else:
                        match = re.match(
                            r'Byte-by-byte Description of file: (?P<name>.+)$',
                            line, re.IGNORECASE)
                        if match:
                            # Split 'name' in case in contains multiple files
                            names = [
                                s for s in re.split('[, ]+', match.group('name'))
                                if s
                            ]
                            # Iterate on names to find if one matches the tablename
                            # including wildcards.
                            for pattern in names:
                                if fnmatch.fnmatch(self.data.table_name, pattern):
                                    in_header = True
                                    lines.append(line)
                                    break
                else:
                    # File exhausted before the third delimiter was reached.
                    raise core.InconsistentTableError(
                        "Cant' find table {0} in {1}".format(
                            self.data.table_name, self.readme))

        for i_col_def, line in enumerate(lines):
            if re.match(r'Byte-by-byte Description', line, re.IGNORECASE):
                break
        else:
            # Without this the loop variable would be undefined (empty input)
            # or point at the last line, silently yielding zero columns.
            raise core.InconsistentTableError(
                'No "Byte-by-byte Description" line found in CDS header')

        re_col_def = re.compile(
            r"""\s*
                                    (?P<start> \d+ \s* -)? \s*
                                    (?P<end>   \d+)        \s+
                                    (?P<format> [\w.]+)     \s+
                                    (?P<units> \S+)        \s+
                                    (?P<name>  \S+)        \s+
                                    (?P<descr> \S.+)""", re.VERBOSE)

        cols = []
        # Column definitions start four lines after the description line.
        for i, line in enumerate(itertools.islice(lines, i_col_def + 4, None)):
            if line.startswith(delimiters):
                break
            match = re_col_def.match(line)
            if match:
                col = core.Column(name=match.group('name'), index=i)
                # A single-byte column has no "start-" part; its start is its
                # end.  Convert the 1-based start to a 0-based offset.
                col.start = int(
                    re.sub(r'[-\s]', '',
                           match.group('start') or match.group('end'))) - 1
                col.end = int(match.group('end'))
                col.units = match.group('units')
                col.descr = match.group('descr')
                col.raw_type = match.group('format')
                col.type = self.get_col_type(col)

                # A description starting with "?" (optionally "?=value")
                # declares a null marker for the column.
                null_match = re.match(r'\? (?P<equal> =)? (?P<nullval> \S*)',
                                      col.descr, re.VERBOSE)
                if null_match:
                    if issubclass(col.type, core.FloatType):
                        fillval = 'nan'
                    else:
                        fillval = '-999'
                    nullval = null_match.group('nullval')
                    if nullval == '':
                        col.null = ''
                    elif nullval == '-':
                        col.null = '---'
                    else:
                        col.null = nullval
                    self.data.fill_values.append((col.null, fillval, col.name))

                cols.append(col)
            else:  # could be a continuation of the previous col's description
                if cols:
                    cols[-1].descr += line.strip()
                else:
                    raise ValueError('Line "%s" not parsable as CDS header' %
                                     line)

        self.names = [x.name for x in cols]
        names = set(self.names)
        if self.include_names is not None:
            names.intersection_update(self.include_names)
        if self.exclude_names is not None:
            names.difference_update(self.exclude_names)

        self.cols = [x for x in cols if x.name in names]
        self.n_data_cols = len(self.cols)

        # Re-index the cols because the FixedWidthSplitter does NOT return the ignored
        # cols (as is the case for typical delimiter-based splitters)
        for i, col in enumerate(self.cols):
            col.index = i