def create_column(self, col_name, col_type):
  '''Takes the output from parse_col_desc and creates the right column type. This
     method returns one of Column, ArrayColumn, MapColumn, StructColumn.'''
  if isinstance(col_type, str):
    if col_type.upper() == 'VARCHAR':
      col_type = 'STRING'
    type_name = self.TYPE_NAME_ALIASES.get(col_type.upper())
    return Column(owner=None,
                  name=col_name.lower(),
                  exact_type=self.TYPES_BY_NAME[type_name])

  general_class = col_type[0]
  if general_class.upper() == 'ARRAY':
    return ArrayColumn(
        owner=None,
        name=col_name.lower(),
        item=self.create_column(col_name='item', col_type=col_type[1]))

  if general_class.upper() == 'MAP':
    return MapColumn(
        owner=None,
        name=col_name.lower(),
        key=self.create_column(col_name='key', col_type=col_type[1]),
        value=self.create_column(col_name='value', col_type=col_type[2]))

  if general_class.upper() == 'STRUCT':
    struct_col = StructColumn(owner=None, name=col_name.lower())
    for field_name, field_type in col_type[1:]:
      struct_col.add_col(self.create_column(field_name, field_type))
    return struct_col

  general_class = self.TYPE_NAME_ALIASES.get(col_type[0].upper())
  if general_class.upper() == 'DECIMAL':
    return Column(owner=None,
                  name=col_name.lower(),
                  exact_type=get_decimal_class(int(col_type[1]), int(col_type[2])))

  if general_class.upper() == 'CHAR':
    return Column(owner=None,
                  name=col_name.lower(),
                  exact_type=get_char_class(int(col_type[1])))

  if general_class.upper() == 'VARCHAR':
    type_size = int(col_type[1])
    if type_size <= VarChar.MAX:
      cur_type = get_varchar_class(type_size)
    else:
      cur_type = self.TYPES_BY_NAME['STRING']
    return Column(owner=None, name=col_name.lower(), exact_type=cur_type)

  raise Exception('unable to parse: {0}, type: {1}'.format(col_name, col_type))

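# Hypothetical usage sketch (assumed, not from the source): 'conn' stands for an
# instance of the class that defines create_column(), and the nested tuples below
# mimic what parse_col_desc is described as returning; all names are illustrative.
scalar_col = conn.create_column('int_col', 'INT')
map_col = conn.create_column('map_col', ('MAP', 'STRING', ('ARRAY', 'INT')))
struct_col = conn.create_column(
    'struct_col', ('STRUCT', ('f1', 'STRING'), ('f2', ('DECIMAL', '10', '2'))))
assert isinstance(map_col, MapColumn) and isinstance(struct_col, StructColumn)
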
def FakeColumn(name, type_, is_primary_key=False):
  """Return a Column without requiring the caller to specify the first constructor
     argument (the table that owns the column). Typically used when creating a
     FakeTable: pass FakeColumns as the arguments.
  """
  col = Column(None, name, type_)
  col.is_primary_key = is_primary_key
  return col

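# Hypothetical usage sketch (assumed, not from the source): per the docstring above,
# FakeColumns are meant to be handed to a FakeTable; the FakeTable signature shown
# here is an assumption for illustration only.
fake_table = FakeTable(
    'customers',
    [FakeColumn('id', BigInt, is_primary_key=True),
     FakeColumn('name', String)])
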
def test_table_model(self, cursor, hive_cursor):
  table = Table("some_test_table")
  cursor.drop_table(table.name, if_exists=True)
  table.storage_format = 'textfile'
  table.add_col(Column(table, "bigint_col", BigInt))
  table.add_col(Column(table, "string_col", String))
  cursor.create_table(table)
  try:
    other = hive_cursor.describe_table(table.name)
    assert other.name == table.name
    assert other.cols == table.cols
  finally:
    cursor.drop_table(table.name)

def create_join_predicate(self, parent_table, child_table):
  for col in parent_table.cols:
    if col.name == 'id':
      parent_id_col = col
      break
  else:
    # No 'id' col was found on the parent table; add one so the tables can be joined.
    parent_id_col = Column(parent_table, 'id', BigInt)
    parent_id_col.for_flattening = True
    parent_table.add_col(parent_id_col)
  child_col_name = self.flat_collection_name(parent_table) + '_id'
  child_col = Column(None, child_col_name, BigInt)
  child_table.add_col(child_col)
  return Equals.create_from_args(parent_id_col, child_col)

def _create_random_table(self, table_name, min_col_count, max_col_count,
                         allowed_storage_formats):
  '''Create and return a Table with a random number of cols.'''
  col_count = randint(min_col_count, max_col_count)
  storage_format = choice(allowed_storage_formats)
  table = Table(table_name)
  table.storage_format = storage_format
  allowed_types = list(TYPES)
  # Avro doesn't support timestamps yet.
  if table.storage_format == 'AVRO':
    allowed_types.remove(Timestamp)
  # TODO: 'table.cols' returns a copy of all scalar cols, so 'table.cols.append()'
  # doesn't actually modify the table's columns. 'table.cols' should be changed
  # to allow access to the real columns.
  cols = table.cols
  for col_idx in xrange(col_count):
    col_type = choice(allowed_types)
    col_type = choice(
        filter(lambda type_: issubclass(type_, col_type), EXACT_TYPES))
    if issubclass(col_type, VarChar) and not issubclass(col_type, String):
      col_type = get_varchar_class(randint(1, VarChar.MAX))
    elif issubclass(col_type, Char) and not issubclass(col_type, String):
      col_type = get_char_class(randint(1, Char.MAX))
    elif issubclass(col_type, Decimal):
      max_digits = randint(1, Decimal.MAX_DIGITS)
      col_type = get_decimal_class(max_digits, randint(1, max_digits))
    col = Column(
        table,
        '%s_col_%s' % (col_type.__name__.lower(), col_idx + 1),
        col_type)
    cols.append(col)
  table.cols = cols
  return table

def create_random_table(self, table_name, min_number_of_cols, max_number_of_cols,
                        allowed_storage_formats):
  '''Create and return a Table with a random number of cols.'''
  col_count = randint(min_number_of_cols, max_number_of_cols)
  storage_format = choice(allowed_storage_formats)
  table = Table(table_name)
  table.storage_format = storage_format
  for col_idx in xrange(col_count):
    col_type = choice(TYPES)
    col_type = choice(
        filter(lambda type_: issubclass(type_, col_type), EXACT_TYPES))
    if issubclass(col_type, VarChar) and not issubclass(col_type, String):
      col_type = get_varchar_class(randint(1, VarChar.MAX))
    elif issubclass(col_type, Char) and not issubclass(col_type, String):
      col_type = get_char_class(randint(1, Char.MAX))
    elif issubclass(col_type, Decimal):
      max_digits = randint(1, Decimal.MAX_DIGITS)
      col_type = get_decimal_class(max_digits, randint(1, max_digits))
    col = Column(
        table,
        '%s_col_%s' % (col_type.__name__.lower(), col_idx + 1),
        col_type)
    table.cols.append(col)
  return table

def test_hive_create_equality_only_joins():
  """Tests that QueryGenerator produces a join condition with only equality functions
     if the HiveProfile is used.
  """
  class FakeHiveQueryProfile(HiveProfile):
    """A fake QueryProfile that extends the HiveProfile; various weights are modified
       in order to ensure that this test is deterministic.
    """
    def choose_join_condition_count(self):
      """There should be only one operator in the JOIN condition"""
      return 1

    def choose_conjunct_disjunct_fill_ratio(self):
      """There should be no AND or OR operators"""
      return 0

    def choose_relational_func_fill_ratio(self):
      """Force all operators to be relational"""
      return 1

  query_generator = QueryGenerator(FakeHiveQueryProfile())

  # Create two tables that each have one joinable Column.
  right_table_expr_list = TableExprList()
  right_table = Table("right_table")
  right_table.add_col(Column("right_table", "right_col", Int))
  right_table_expr_list.append(right_table)

  left_table_expr_list = TableExprList()
  left_table = Table("left_table")
  left_table.add_col(Column("left_table", "left_col", Int))
  left_table_expr_list.append(left_table)

  # Validate that the root predicate is an Equals func.
  assert isinstance(query_generator._create_relational_join_condition(
      right_table_expr_list, left_table_expr_list), Equals)

def test_use_nested_with_subquery():
  """Tests that setting DefaultProfile.use_nested_with to False works properly. Setting
     this method to return False should prevent a WITH clause from being used inside a
     sub-query.
  """
  class MockQueryProfile(DefaultProfile):
    """A mock QueryProfile that sets use_nested_with to False and forces the
       QueryGenerator to create nested queries.
    """
    def __init__(self):
      super(MockQueryProfile, self).__init__()

      # Force the QueryGenerator to create nested queries
      self._bounds['MAX_NESTED_QUERY_COUNT'] = (4, 4)

      # Force the QueryGenerator to use WITH clauses whenever possible
      self._probabilities['OPTIONAL_QUERY_CLAUSES']['WITH'] = 1

      # Force the QueryGenerator to create inline views whenever possible
      self._probabilities['MISC']['INLINE_VIEW'] = 1

    def use_nested_with(self):
      return False

  mock_query_gen = QueryGenerator(MockQueryProfile())

  # Create two tables
  table_expr_list = TableExprList()
  right_table = Table("right_table")
  right_table.add_col(Column("right_table", "right_col", Int))
  table_expr_list.append(right_table)
  left_table = Table("left_table")
  left_table.add_col(Column("left_table", "left_col", Int))
  table_expr_list.append(left_table)

  # Check that each nested_query doesn't have a with clause
  for nested_query in mock_query_gen.generate_statement(table_expr_list).nested_queries:
    assert nested_query.with_clause is None

def describe_table(self, table_name):
  '''Return a Table with table and col names always in lowercase.'''
  rows = self.execute_and_fetchall(self.make_describe_table_sql(table_name))
  table = Table(table_name.lower())
  for row in rows:
    col_name, data_type = row[:2]
    if data_type == 'tinyint(1)':
      # Just assume this is a boolean...
      data_type = 'boolean'
    if 'decimal' not in data_type and '(' in data_type:
      # Strip the size of the data type
      data_type = data_type[:data_type.index('(')]
    table.cols.append(
        Column(table, col_name.lower(), self.parse_data_type(data_type)))
  return table

def describe_table(self, table_name):
  '''Return a Table with table and col names always in lowercase.'''
  rows = self.execute_and_fetchall(self.make_describe_table_sql(table_name))
  table = Table(table_name.lower())
  for row in rows:
    col_name, data_type = row[:2]
    match = self.SQL_TYPE_PATTERN.match(data_type)
    if not match:
      raise Exception('Unexpected data type format: %s' % data_type)
    type_name = self.TYPE_NAME_ALIASES.get(match.group(1).upper())
    if not type_name:
      raise Exception('Unknown data type: ' + match.group(1))
    if len(match.groups()) > 1 and match.group(2) is not None:
      type_size = [int(size) for size in match.group(2)[1:-1].split(',')]
    else:
      type_size = None
    table.cols.append(
        Column(table, col_name.lower(), self.parse_data_type(type_name, type_size)))
  self.load_unique_col_metadata(table)
  return table

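# Hypothetical illustration (assumed, not from the source): the actual SQL_TYPE_PATTERN
# is not shown above, so the regex below is only a stand-in showing how a raw type such
# as 'decimal(10,2)' could be split into the type_name and type_size used by the loop.
import re

SQL_TYPE_PATTERN = re.compile(r'([A-Za-z]+)(\(\d+(?:\s*,\s*\d+)*\))?')
match = SQL_TYPE_PATTERN.match('decimal(10,2)')
type_name = match.group(1).upper()                              # 'DECIMAL'
type_size = [int(size) for size in match.group(2)[1:-1].split(',')]  # [10, 2]
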
def cols(self):
  return ValExprList(
      Column(self, item.name, item.type) for item in self.query.select_clause.items)