def make_skeleton(path, relations, item_rows, gzip=False):
    """
    Instantiate a new profile skeleton (only the relations file and
    item file) from an existing relations file and a list of rows
    for the item table. For standard relations files, it is suggested
    to have, as a minimum, the `i-id` and `i-input` fields in the
    item rows.

    Args:
        path: the destination directory of the skeleton---must not
            already exist, as it will be created
        relations: the path to the relations file
        item_rows: the rows to use for the item file
        gzip: if `True`, the item file will be compressed
    Returns:
        An ItsdbProfile containing the skeleton data (but the profile
        data will already have been written to disk).
    Raises:
        ItsdbError if the destination directory could not be created,
        whether because it already exists or for any other reason
        reported by the operating system.
    """
    import errno
    import shutil

    try:
        os.makedirs(path)
    except OSError as ex:
        # Only claim the path exists when that is really the cause;
        # other failures (permissions, read-only filesystem, ...) get
        # a message that carries the underlying error.
        if ex.errno == errno.EEXIST:
            raise ItsdbError('Path already exists: {}.'.format(path))
        raise ItsdbError(
            'Could not create directory {}: {}'.format(path, ex))
    shutil.copyfile(relations, os.path.join(path, _relations_filename))
    # index=False is passed through unchanged from the original code
    prof = ItsdbProfile(path, index=False)
    prof.write_table('item', item_rows, gzip=gzip)
    return prof
def join(self, table1, table2, key_filter=True):
    """
    Yield rows from a table built by joining *table1* and *table2*.
    The column names in the rows have the original table name
    prepended and separated by a colon. For example, joining tables
    'item' and 'parse' will result in column names like
    'item:i-input' and 'parse:parse-id'.

    The join is an inner join on a single key field: the first key
    field of *table1* (in the order the fields are listed in
    `self.relations[table1]`) that is also a key field of *table2*.

    Args:
        table1: the name of the left table; its rows are streamed
        table2: the name of the right table; its rows are held in
            memory for the duration of the join
        key_filter: passed through to `read_table()` for both tables
    Yields:
        An OrderedDict per matching row pair, with the columns of
        *table1* followed by the columns of *table2*.
    Raises:
        ItsdbError if the two tables share no key field.
    """
    table1_keys = [f.name for f in self.relations[table1] if f.key]
    table2_keys = set(f.name for f in self.relations[table2] if f.key)
    # Choose the key from an ordered list rather than popping from a
    # set, so the same key is used on every run when several are shared.
    shared_keys = [name for name in table1_keys if name in table2_keys]
    if not shared_keys:
        raise ItsdbError(
            'Cannot join tables "{}" and "{}"; no shared key exists.'
            .format(table1, table2))
    key = shared_keys[0]
    # this join method stores the whole of table2 in memory, but it is
    # MUCH faster than a nested loop method. Most profiles will fit in
    # memory anyway, so it's a decent tradeoff
    table2_data = defaultdict(list)
    for row in self.read_table(table2, key_filter=key_filter):
        table2_data[row[key]].append(row)
    for row1 in self.read_table(table1, key_filter=key_filter):
        # .get() (not indexing) so unmatched keys are not inserted
        # into the defaultdict
        for row2 in table2_data.get(row1[key], []):
            joinedrow = OrderedDict(
                [('{}:{}'.format(table1, k), v) for k, v in row1.items()] +
                [('{}:{}'.format(table2, k), v) for k, v in row2.items()])
            yield joinedrow
def _write_table(profile_dir, table_name, rows, fields, append=False,
                 gzip=False):
    """
    Write *rows* to the file for *table_name* inside *profile_dir*.

    Each row is formatted with `make_row(row, fields)` and written on
    its own line.

    Args:
        profile_dir: the existing profile directory to write into
        table_name: the table's filename within *profile_dir*
        rows: an iterable of rows to write
        fields: passed to `make_row()` with every row
        append: if `True`, open the file in append mode instead of
            overwriting it
        gzip: if `True` and *rows* is not empty, write to
            `<table_name>.gz` through `gzopen`; when *rows* is empty
            the plain file is opened instead and nothing is written
    Raises:
        ItsdbError if *profile_dir* does not exist.
    """
    # don't gzip if empty
    rows = iter(rows)
    try:
        first_row = next(rows)
    except StopIteration:
        gzip = False
    else:
        # put the peeked row back in front of the remaining ones
        rows = chain([first_row], rows)
    if gzip and append:
        logging.warning('Appending to a gzip file may result in '
                        'inefficient compression.')

    if not os.path.exists(profile_dir):
        raise ItsdbError(
            'Profile directory does not exist: {}'.format(profile_dir))

    tbl_filename = os.path.join(profile_dir, table_name)
    mode = 'a' if append else 'w'
    if gzip:
        # text mode ('t') is only available from py3.3; until then
        # wrap the gzip stream in a TextIOWrapper
        f = TextIOWrapper(gzopen(tbl_filename + '.gz', mode=mode))
    else:
        f = open(tbl_filename, mode=mode)

    # always release the handle, even if formatting or writing fails
    try:
        for row in rows:
            f.write(make_row(row, fields) + '\n')
    finally:
        f.close()
def add_applicator(self, table, cols, function):
    """
    Add an applicator. When reading *table*, rows in *table* will be
    modified by apply_rows().

    Args:
        table: The table to apply the function to.
        cols: The columns in *table* to apply the function on.
        function: The applicator function.
    Raises:
        ItsdbError if *table* is not defined by the relations, if
        *cols* is `None`, or if any of *cols* is not a field of
        *table*.
    """
    if table not in self.relations:
        raise ItsdbError('Cannot add applicator; table "{}" is not '
                         'defined by the relations file.'.format(table))
    if cols is None:
        raise ItsdbError('Cannot add applicator; columns not specified.')

    known_names = {field.name for field in self.relations[table]}
    for name in cols:
        if name in known_names:
            continue
        # the first unknown column (in *cols* order) is reported
        raise ItsdbError('Cannot add applicator; column "{}" not '
                         'defined by the relations file.'.format(name))

    entry = (cols, function)
    self.applicators[table].append(entry)
def add_filter(self, table, cols, condition):
    """
    Add a filter. When reading *table*, rows in *table* will be
    filtered by filter_rows().

    Args:
        table: The table the filter applies to; `None` is accepted
            without being checked against the relations.
        cols: The columns in *table* to filter on; `None` is stored
            as `[None]`.
        condition: The filter function.
    Raises:
        ItsdbError if *table* is not `None` and is not defined by the
        relations.
    """
    table_is_acceptable = table is None or table in self.relations
    if not table_is_acceptable:
        raise ItsdbError('Cannot add filter; table "{}" is not defined '
                         'by the relations file.'.format(table))
    # this is a hack, though perhaps well-motivated
    filter_cols = [None] if cols is None else cols
    self.filters[table].append((filter_cols, condition))
def _table_filename(tbl_filename):
    """
    Return the path of the existing file to use for a table.

    *tbl_filename* may be given with or without a `.gz` suffix. Both
    the plain and the `.gz` paths are considered:

    * if only one of them exists, that one is returned;
    * if both exist, the `.gz` path is returned only when its
      modification time is strictly newer than the plain file's,
      otherwise the plain path is returned.

    Raises:
        ItsdbError if neither file exists.
    """
    if tbl_filename.endswith('.gz'):
        gzfn = tbl_filename
        txfn = tbl_filename[:-3]
    else:
        txfn = tbl_filename
        gzfn = tbl_filename + '.gz'

    if os.path.exists(txfn):
        if (os.path.exists(gzfn) and
                os.stat(gzfn).st_mtime > os.stat(txfn).st_mtime):
            tbl_filename = gzfn
        else:
            tbl_filename = txfn
    elif os.path.exists(gzfn):
        tbl_filename = gzfn
    else:
        # report the base name so that a '.gz' argument is not shown
        # as 'name.gz(.gz)'
        raise ItsdbError(
            'Table does not exist at {}(.gz)'.format(txfn))

    return tbl_filename
def _open_table(tbl_filename):
    """
    Yield a single open, readable file object for a table.

    *tbl_filename* may be given with or without a `.gz` suffix. The
    plain file is used when it exists (a warning is logged if the
    `.gz` file exists as well); otherwise the `.gz` file is opened
    through `gzopen` and wrapped for text reading. The file is closed
    when the generator is resumed or closed after the yield.

    Raises:
        ItsdbError if neither the plain nor the `.gz` file exists.
    """
    if tbl_filename.endswith('.gz'):
        gz_filename = tbl_filename
        tbl_filename = tbl_filename[:-3]
    else:
        gz_filename = tbl_filename + '.gz'

    if os.path.exists(tbl_filename) and os.path.exists(gz_filename):
        logging.warning(
            'Both gzipped and plaintext files were found; attempting to '
            'use the plaintext one.')

    if os.path.exists(tbl_filename):
        with open(tbl_filename) as handle:
            yield handle
    elif os.path.exists(gz_filename):
        # text mode only from py3.3; until then use TextIOWrapper
        raw_stream = gzopen(tbl_filename + '.gz', mode='r')
        with TextIOWrapper(BufferedReader(raw_stream)) as handle:
            yield handle
    else:
        raise ItsdbError(
            'Table does not exist at {}(.gz)'.format(tbl_filename))
def select_rows(cols, rows, mode='list'):
    """
    Yield data selected from rows.

    It is sometimes useful to select a subset of data from a profile.
    This function selects the data in *cols* from *rows* and yields it
    in a form specified by *mode*. Possible values of *mode* are:

    | mode           | description       | example `['i-id', 'i-wf']` |
    | -------------- | ----------------- | -------------------------- |
    | list (default) | a list of values  | `[10, 1]`                  |
    | dict           | col to value map  | `{'i-id':'10','i-wf':'1'}` |
    | row            | [incr tsdb()] row | `'10@1'`                   |

    Columns missing from a row are selected as `None` (rows are read
    with `row.get(col)`).

    Args:
        cols: an iterable of column names to select data for; it may
            be a one-shot iterator, as it is materialized once
        rows: the rows to select column data from
        mode: the form yielded data should take (case-insensitive)
    Yields:
        Selected data in the form specified by *mode*.
    Raises:
        ItsdbError if *mode* is not one of the valid options. As this
        is a generator, the error surfaces on first iteration, not at
        call time.
    """
    mode = mode.lower()
    if mode == 'list':
        cast = lambda cols, data: data
    elif mode == 'dict':
        cast = lambda cols, data: dict(zip(cols, data))
    elif mode == 'row':
        cast = lambda cols, data: encode_row(data)
    else:
        raise ItsdbError(
            'Invalid mode for select operation: {}\n'
            '  Valid options include: list, dict, row'.format(mode))
    # cols is iterated once per row (and again by the dict cast), so a
    # one-shot iterator must be materialized or only the first row
    # would get any data
    cols = list(cols)
    for row in rows:
        data = [row.get(c) for c in cols]
        yield cast(cols, data)
def table_relations(self, table):
    """
    Return the entry of `self.relations` for *table*.

    Raises:
        ItsdbError if *table* is not defined in the relations.
    """
    if table in self.relations:
        return self.relations[table]
    raise ItsdbError(
        'Table {} is not defined in the profiles relations.'.format(
            table))