Exemplo n.º 1
0
def make_skeleton(path, relations, item_rows, gzip=False):
    """
    Instantiate a new profile skeleton (only the relations file and
    item file) from an existing relations file and a list of rows
    for the item table. For standard relations files, it is suggested
    to have, as a minimum, the `i-id` and `i-input` fields in the
    item rows.

    Args:
        path: the destination directory of the skeleton---must not
              already exist, as it will be created
        relations: the path to the relations file
        item_rows: the rows to use for the item file
        gzip: if `True`, the item file will be compressed
    Returns:
        An ItsdbProfile containing the skeleton data (but the profile
        data will already have been written to disk).
    Raises:
        ItsdbError if the destination directory could not be created
        (because it already exists or for any other OS-level reason).
    """
    try:
        os.makedirs(path)
    except OSError as ex:
        # makedirs can fail for reasons other than a pre-existing
        # directory (permissions, invalid parent, ...); only report
        # "already exists" when that is really the case.
        if os.path.exists(path):
            raise ItsdbError('Path already exists: {}.'.format(path))
        raise ItsdbError(
            'Could not create path: {} ({})'.format(path, ex))
    import shutil
    # the relations file is copied verbatim into the new directory
    shutil.copyfile(relations, os.path.join(path, _relations_filename))
    prof = ItsdbProfile(path, index=False)
    prof.write_table('item', item_rows, gzip=gzip)
    return prof
Exemplo n.º 2
0
 def join(self, table1, table2, key_filter=True):
     """
     Yield rows from a table built by joining *table1* and *table2*.
     The column names in the rows have the original table name
     prepended and separated by a colon. For example, joining tables
     'item' and 'parse' will result in column names like
     'item:i-input' and 'parse:parse-id'.

     The join is made on a single key field shared by both tables; if
     more than one key field is shared, the first one in the field
     order of *table1* is used.

     Args:
         table1: name of the first (left) table
         table2: name of the second (right) table
         key_filter: passed through to read_table() for both tables
     Yields:
         An OrderedDict per matching pair of rows, with the columns
         of *table1* followed by the columns of *table2*.
     Raises:
         ItsdbError if the two tables share no key field.
     """
     table1_keys = [f.name for f in self.relations[table1] if f.key]
     table2_keys = set(f.name for f in self.relations[table2] if f.key)
     shared = [k for k in table1_keys if k in table2_keys]
     if not shared:
         raise ItsdbError(
             'Cannot join tables "{}" and "{}"; no shared key exists.'.
             format(table1, table2))
     # take the first shared key in table1's order rather than popping
     # from a set, so the result does not depend on set iteration order
     key = shared[0]
     # this join method stores the whole of table2 in memory, but it is
     # MUCH faster than a nested loop method. Most profiles will fit in
     # memory anyway, so it's a decent tradeoff
     table2_data = defaultdict(list)
     for row in self.read_table(table2, key_filter=key_filter):
         table2_data[row[key]].append(row)
     for row1 in self.read_table(table1, key_filter=key_filter):
         # .get() avoids creating empty entries for unmatched keys
         for row2 in table2_data.get(row1[key], ()):
             joinedrow = OrderedDict(
                 [('{}:{}'.format(table1, k), v)
                  for k, v in row1.items()] +
                 [('{}:{}'.format(table2, k), v)
                  for k, v in row2.items()])
             yield joinedrow
Exemplo n.º 3
0
def _write_table(profile_dir,
                 table_name,
                 rows,
                 fields,
                 append=False,
                 gzip=False):
    # don't gzip if empty
    rows = iter(rows)
    try:
        first_row = next(rows)
    except StopIteration:
        gzip = False
    else:
        rows = chain([first_row], rows)
    if gzip and append:
        logging.warning('Appending to a gzip file may result in '
                        'inefficient compression.')

    if not os.path.exists(profile_dir):
        raise ItsdbError(
            'Profile directory does not exist: {}'.format(profile_dir))

    tbl_filename = os.path.join(profile_dir, table_name)
    mode = 'a' if append else 'w'
    if gzip:
        # text mode only from py3.3; until then use TextIOWrapper
        #mode += 't'  # text mode for gzip
        f = TextIOWrapper(gzopen(tbl_filename + '.gz', mode=mode))
    else:
        f = open(tbl_filename, mode=mode)

    for row in rows:
        f.write(make_row(row, fields) + '\n')

    f.close()
Exemplo n.º 4
0
    def add_applicator(self, table, cols, function):
        """
        Register *function* as an applicator for *table*. Rows read
        from *table* will then be modified by apply_rows().

        Args:
            table: Name of the table whose rows the function applies to.
            cols: Names of the columns in *table* the function is
                applied on.
            function: The applicator function.
        Raises:
            ItsdbError: if *table* or any of *cols* is not defined by
                the relations file, or if *cols* is None.
        """
        prefix = 'Cannot add applicator; '
        if table not in self.relations:
            message = (prefix + 'table "{}" is not defined by the '
                       'relations file.')
            raise ItsdbError(message.format(table))
        if cols is None:
            raise ItsdbError(prefix + 'columns not specified.')

        # every requested column must be a field of the table
        known_names = set(field.name for field in self.relations[table])
        for name in cols:
            if name in known_names:
                continue
            message = (prefix + 'column "{}" not defined by the '
                       'relations file.')
            raise ItsdbError(message.format(name))

        entry = (cols, function)
        self.applicators[table].append(entry)
Exemplo n.º 5
0
    def add_filter(self, table, cols, condition):
        """
        Register *condition* as a filter for *table*. Rows read from
        *table* will then be filtered by filter_rows().

        Args:
            table: Name of the table the filter applies to; None is
                accepted without being checked against the relations.
            cols: Names of the columns in *table* to filter on; None
                is stored as the one-element list `[None]`.
            condition: The filter function.
        Raises:
            ItsdbError: if *table* is given but is not defined by the
                relations file.
        """
        table_is_unknown = (table is not None
                            and table not in self.relations)
        if table_is_unknown:
            message = ('Cannot add filter; table "{}" is not defined by '
                       'the relations file.')
            raise ItsdbError(message.format(table))
        # a missing column list is replaced by a [None] placeholder
        # (a hack, though perhaps well-motivated)
        filter_cols = [None] if cols is None else cols
        self.filters[table].append((filter_cols, condition))
Exemplo n.º 6
0
def _table_filename(tbl_filename):
    if tbl_filename.endswith('.gz'):
        gzfn = tbl_filename
        txfn = tbl_filename[:-3]
    else:
        txfn = tbl_filename
        gzfn = tbl_filename + '.gz'

    if os.path.exists(txfn):
        if (os.path.exists(gzfn)
                and os.stat(gzfn).st_mtime > os.stat(txfn).st_mtime):
            tbl_filename = gzfn
        else:
            tbl_filename = txfn
    elif os.path.exists(gzfn):
        tbl_filename = gzfn
    else:
        raise ItsdbError(
            'Table does not exist at {}(.gz)'.format(tbl_filename))

    return tbl_filename
Exemplo n.º 7
0
def _open_table(tbl_filename):
    if tbl_filename.endswith('.gz'):
        gz_filename = tbl_filename
        tbl_filename = tbl_filename[:-3]
    else:
        gz_filename = tbl_filename + '.gz'

    if os.path.exists(tbl_filename) and os.path.exists(gz_filename):
        logging.warning(
            'Both gzipped and plaintext files were found; attempting to '
            'use the plaintext one.')
    if os.path.exists(tbl_filename):
        with open(tbl_filename) as f:
            yield f
    elif os.path.exists(gz_filename):
        # text mode only from py3.3; until then use TextIOWrapper
        with TextIOWrapper(
                BufferedReader(gzopen(tbl_filename + '.gz', mode='r'))) as f:
            yield f
    else:
        raise ItsdbError(
            'Table does not exist at {}(.gz)'.format(tbl_filename))
Exemplo n.º 8
0
def select_rows(cols, rows, mode='list'):
    """
    Yield data selected from rows.

    It is sometimes useful to select a subset of data from a profile.
    This function selects the data in *cols* from *rows* and yields it
    in a form specified by *mode*. Possible values of *mode* are:

    | mode           | description       | example `['i-id', 'i-wf']` |
    | -------------- | ----------------- | -------------------------- |
    | list (default) | a list of values  | `[10, 1]`                  |
    | dict           | col to value map  | `{'i-id':'10','i-wf':'1'}` |
    | row            | [incr tsdb()] row | `'10@1'`                   |

    Columns missing from a row are selected as `None`.

    Args:
        cols: an iterable of column names to select data for
        rows: the rows to select column data from
        mode: the form yielded data should take (case-insensitive)

    Yields:
        Selected data in the form specified by *mode*.

    Raises:
        ItsdbError if *mode* is not one of the valid options.
    """
    mode = mode.lower()
    if mode == 'list':
        cast = lambda cols, data: data
    elif mode == 'dict':
        cast = lambda cols, data: dict(zip(cols, data))
    elif mode == 'row':
        cast = lambda cols, data: encode_row(data)
    else:
        raise ItsdbError(
            'Invalid mode for select operation: {}\n'
            '  Valid options include: list, dict, row'.format(mode))
    # cols is walked once per row, so a one-shot iterator (e.g. a
    # generator) must be materialized or only the first row gets data
    cols = list(cols)
    for row in rows:
        data = [row.get(c) for c in cols]
        yield cast(cols, data)
Exemplo n.º 9
0
 def table_relations(self, table):
     """
     Return the relations entry stored for *table*.

     Raises:
         ItsdbError: if *table* is not in the profile's relations.
     """
     if table in self.relations:
         return self.relations[table]
     message = 'Table {} is not defined in the profiles relations.'
     raise ItsdbError(message.format(table))