def _GetAttributeFilterUnion(self, attributes, timestamp_filter=None):
    filters = []
    for attribute_prefix in attributes:
        family, column = self.GetFamilyColumn(attribute_prefix)

        family_filter = row_filters.FamilyNameRegexFilter(family)
        row_filter_list = [family_filter]
        if column:
            col_filter = row_filters.ColumnQualifierRegexFilter(column)
            row_filter_list.append(col_filter)

        if timestamp_filter:
            row_filter_list.append(timestamp_filter)

        if len(row_filter_list) > 1:
            row_filter = row_filters.RowFilterChain(filters=row_filter_list)
        else:
            row_filter = row_filter_list[0]

        filters.append(row_filter)

    # More than one attribute, use a union, otherwise just use the
    # existing filter.
    if len(filters) > 1:
        filters = row_filters.RowFilterUnion(filters=filters)
    else:
        filters = filters[0]

    return filters

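# _GetAttributeFilterUnion() relies on self.GetFamilyColumn() to split a GRR
# attribute prefix into a Bigtable column family and column qualifier. A
# minimal sketch of that split, assuming prefixes take the "family:column"
# form (e.g. "aff4:type", "metadata:last"); the real method may also validate
# the family against the table schema.
def _get_family_column_sketch(attribute_prefix):
    family, _, column = attribute_prefix.partition(":")
    return family, column
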
def filter_limit_col_family_regex(project_id, instance_id, table_id):
    client = bigtable.Client(project=project_id, admin=True)
    instance = client.instance(instance_id)
    table = instance.table(table_id)

    rows = table.read_rows(
        filter_=row_filters.FamilyNameRegexFilter("stats_.*$".encode("utf-8")))
    for row in rows:
        print_row(row)

def filter_composing_chain(project_id, instance_id, table_id):
    client = bigtable.Client(project=project_id, admin=True)
    instance = client.instance(instance_id)
    table = instance.table(table_id)

    rows = table.read_rows(
        filter_=row_filters.RowFilterChain(
            filters=[row_filters.CellsColumnLimitFilter(1),
                     row_filters.FamilyNameRegexFilter("cell_plan")]))
    for row in rows:
        print_row(row)

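# Both filter snippets above call a print_row() helper that is not defined in
# this section. A minimal sketch of such a helper, assuming the rows are
# google.cloud.bigtable.row_data.PartialRowData objects as returned by
# table.read_rows(); the output format here is only illustrative.
def print_row(row):
    print("Reading data for {}:".format(row.row_key.decode("utf-8")))
    for column_family, columns in sorted(row.cells.items()):
        print("Column family {}".format(column_family))
        for column, cells in sorted(columns.items()):
            for cell in cells:
                print("\t{}: {} @{}".format(
                    column.decode("utf-8"),
                    cell.value.decode("utf-8"),
                    cell.timestamp))
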
def read_transaction(column_family_id, user_id, timestamp):
    global _table
    if _table is None:
        init()
    try:
        r = _table.read_row(
            _get_row_key(user_id, timestamp).encode(),
            filter_=row_filters.FamilyNameRegexFilter(column_family_id))
        if r is None:
            return None
        cells = r.cells[column_family_id][_COLUMN_ID.encode()]
        return [(timestamp, cell.value.decode()) for cell in cells]
    except Exception as e:
        print(e)

def write_conditional(project_id, instance_id, table_id):
    client = bigtable.Client(project=project_id, admin=True)
    instance = client.instance(instance_id)
    table = instance.table(table_id)

    timestamp = datetime.datetime.utcnow()
    column_family_id = "stats_summary"
    row_key = "phone#4c410523#20190501"

    row_filter = row_filters.RowFilterChain(filters=[
        row_filters.FamilyNameRegexFilter(column_family_id),
        row_filters.ColumnQualifierRegexFilter("os_build"),
        row_filters.ValueRegexFilter("PQ2A\\..*"),
    ])
    row = table.conditional_row(row_key, filter_=row_filter)
    row.set_cell(column_family_id, "os_name", "android", timestamp)
    row.commit()

    print("Successfully updated row's os_name.")

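# A hypothetical invocation of write_conditional(); the project, instance, and
# table IDs below are placeholders. With the default state=True, the
# conditional row only applies set_cell() when the predicate filter matches at
# least one cell, i.e. when the row already has an os_build value starting
# with "PQ2A.".
if __name__ == "__main__":
    write_conditional("my-project", "my-instance", "mobile-time-series")
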
def read_transactions(column_family_id, user_id, from_timestamp, to_timestamp):
    global _table
    if _table is None:
        init()
    rs = _table.read_rows(
        start_key=_get_row_key(user_id, from_timestamp).encode(),
        end_key=_get_row_key(user_id, to_timestamp).encode(),
        filter_=row_filters.FamilyNameRegexFilter(column_family_id))
    if rs is None:
        return None
    rs.consume_all()
    res = []
    for row_key, row_data in rs.rows.items():
        cells = row_data.cells[column_family_id][_COLUMN_ID.encode()]
        user_id, timestamp = _prase_row_key(row_key)
        res += [(timestamp, cell.value.decode()) for cell in cells]
    return res

def MultiResolvePrefix(self, subjects, attribute_prefix, timestamp=None,
                       limit=None, token=None):
    """Get results from multiple rows matching multiple attributes.

    We could implement this using read_rows, but it is a table scan. Our
    current data model makes that slow because it is a directory hierarchy
    that includes entries for subdirectories interleaved. So if you want all
    the results for a directory, you need to skip those in the scan.

    Instead we make an RPC for each subject all at once using a threadpool.
    We pay more in RPC overhead but we get to do it concurrently.

    Args:
      subjects: A list of subjects.
      attribute_prefix: The attribute prefix.
      timestamp: A range of times for consideration (in microseconds). Can be
        a constant such as ALL_TIMESTAMPS or NEWEST_TIMESTAMP, or a tuple of
        ints (start, end).
      limit: The total number of result values to return.
      token: An ACL token.

    Yields:
      A list of tuples: (subject, [(attribute, value string, timestamp)])
      that can be simply converted to a dict.

      Values with the same attribute (happens when timestamp is not
      NEWEST_TIMESTAMP, but ALL_TIMESTAMPS or a time range) are guaranteed
      to be ordered in decreasing timestamp order.

    Raises:
      AccessError: if anything goes wrong.
      ValueError: if we get a string instead of a list of subjects.
    """
    self.security_manager.CheckDataStoreAccess(
        token, subjects, self.GetRequiredResolveAccess(attribute_prefix))

    if isinstance(subjects, basestring):
        raise ValueError("Expected list of subjects, got string: %s" % subjects)

    if isinstance(attribute_prefix, basestring):
        attribute_prefix_list = [utils.SmartStr(attribute_prefix)]
    else:
        attribute_prefix_list = [utils.SmartStr(x) for x in attribute_prefix]

    timestamp_filter = self._TimestampToFilter(timestamp)

    filter_union = []
    for attribute_prefix in attribute_prefix_list:
        family, column = self.GetFamilyColumn(attribute_prefix)

        family_filter = row_filters.FamilyNameRegexFilter(family)
        row_filter_list = [family_filter]
        if column:
            # Make it an actual regex.
            column += ".*"
            col_filter = row_filters.ColumnQualifierRegexFilter(column)
            row_filter_list.append(col_filter)

        if timestamp_filter:
            row_filter_list.append(timestamp_filter)

        if len(row_filter_list) > 1:
            row_filter = row_filters.RowFilterChain(filters=row_filter_list)
        else:
            row_filter = row_filter_list[0]

        filter_union.append(row_filter)

    # More than one set of prefixes, use a union, otherwise just use the
    # existing filter chain.
    if len(filter_union) > 1:
        attribute_filter = row_filters.RowFilterUnion(filters=filter_union)
    else:
        attribute_filter = filter_union[0]

    # Apply those filters to each subject as a separate RPC using a
    # threadpool.
    pool_args = []
    original_subject_map = {}
    for subject in subjects:
        # List of *args, **kwargs to pass to the RPC caller.
        pool_args.append(((self.table.read_row, "read",
                           utils.SmartStr(subject)),
                          {"filter_": attribute_filter}))

        # We're expected to return subjects as their original type, which can
        # be URN, unicode, or string. Keep a mapping in this dict.
        original_subject_map[utils.SmartStr(subject)] = subject

    max_results = limit or 2**64
    for result in self.pool.imap_unordered(self._WrapCallWithRetry, pool_args):
        if max_results <= 0:
            break
        if result:
            subject_results, max_results = self._GetSubjectResults(
                result, max_results)
            yield original_subject_map[
                result.row_key], self._SortResultsByAttrTimestampValue(
                    subject_results)

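# MultiResolvePrefix() above delegates to self._TimestampToFilter() to turn
# its timestamp argument into a Bigtable row filter. A minimal sketch of that
# conversion for the tuple case, assuming (start, end) arrives in microseconds
# as the docstring states; the handling of ALL_TIMESTAMPS/NEWEST_TIMESTAMP and
# the exact conversion are assumptions, not the original implementation.
import datetime

from google.cloud.bigtable import row_filters


def _timestamp_to_filter_sketch(timestamp):
    if timestamp is None:
        # No restriction: read every version.
        return None
    start_us, end_us = timestamp
    time_range = row_filters.TimestampRange(
        start=datetime.datetime.utcfromtimestamp(start_us / 1e6),
        end=datetime.datetime.utcfromtimestamp(end_us / 1e6))
    return row_filters.TimestampRangeFilter(time_range)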