def sort_orderby(self, by, sort_type=None):
    """In-place sort of self._frames for ORDER BY.

    Args:
        by: list of column names to sort on.
        sort_type: list of booleans, one per column in `by`; True means
            ASC, False means DESC (e.g. [True, False] -> [ASC, DESC]).
            Defaults to ascending on every column.

    Raises:
        KeyError: if a column in `by` is not a projected column.
    """
    if by is not None:
        for column in by:
            if column not in self._frames.columns:
                logger.error(
                    'Can not orderby non-projected column: {}'.format(
                        column))
                raise KeyError(
                    'Can not orderby non-projected column: {}'.format(
                        column))
        # Fix: the default must match len(by); a fixed [True] makes
        # pandas raise when sorting on more than one column.
        if sort_type is None:
            sort_type = [True] * len(by)
        self._frames.sort_values(by, ascending=sort_type,
                                 ignore_index=True, inplace=True)
    else:
        # logger.warn is a deprecated alias of logger.warning
        logger.warning('Columns and Sort Type are required for orderby')
def _bind_load_data_statement(self, node: LoadDataStatement):
    """Bind a LOAD DATA statement: resolve the target table and its
    column list against the catalog.

    Args:
        node: the LOAD DATA statement; its table_ref and column_list
            are bound in place.

    Raises:
        RuntimeError: if the target table does not exist.
    """
    table_ref = node.table_ref
    if node.file_options['file_format'] == FileFormatType.VIDEO:
        # Create a new metadata object for the video table
        create_video_metadata(table_ref.table.table_name)

    self.bind(table_ref)
    table_ref_obj = table_ref.table.table_obj
    if table_ref_obj is None:
        # Fix: the original message used a backslash continuation
        # inside the string literal, embedding the source indentation
        # (and the grammar error 'does not exists') in the error text.
        error = '{} does not exist. Create the table using ' \
                'CREATE TABLE.'.format(table_ref.table.table_name)
        logger.error(error)
        raise RuntimeError(error)

    # If the query specified columns, copy them as-is; otherwise
    # curate the column list from the table metadata.
    if node.column_list is not None:
        column_list = node.column_list
    else:
        column_list = [
            TupleValueExpression(
                col_name=column.name,
                table_alias=table_ref_obj.name.lower(),
                col_object=column)
            for column in table_ref_obj.columns]

    # bind the columns
    for expr in column_list:
        self.bind(expr)
    node.column_list = column_list
def get_petastorm_column(df_column):
    """Translate an EVA column descriptor into a petastorm
    UnischemaField.

    Returns None (after logging an error) for an unrecognized column
    type.
    """
    # Reference:
    # https://github.com/uber/petastorm/blob/master/petastorm/
    # tests/test_common.py
    name = df_column.name
    nullable = df_column.is_nullable
    col_type = df_column.type

    # Scalar column types map directly onto petastorm scalar codecs.
    scalar_map = {
        ColumnType.INTEGER: (np.int32, IntegerType),
        ColumnType.FLOAT: (np.float64, FloatType),
        ColumnType.TEXT: (np.str_, StringType),
    }

    if col_type in scalar_map:
        np_type, spark_type = scalar_map[col_type]
        return UnischemaField(name, np_type, (),
                              ScalarCodec(spark_type()), nullable)
    if col_type == ColumnType.NDARRAY:
        np_type = NdArrayType.to_numpy_type(df_column.array_type)
        return UnischemaField(name, np_type,
                              df_column.array_dimensions,
                              NdarrayCodec(), nullable)

    logger.error("Invalid column type: " + str(col_type))
    return None
def delete(self):
    """Delete this object from the session and commit.

    Raises:
        Exception: re-raises the underlying delete/commit failure.
    """
    try:
        db_session.delete(self)
        self._commit()
    except Exception:
        logger.error("Object couldn't be deleted")
        # Fix: `raise Exception` raised a brand-new, empty exception,
        # discarding the original error's type and message; a bare
        # `raise` propagates the real failure.
        raise
def visitTableName(self, ctx: evaql_parser.TableNameContext):
    """Resolve a table-name parse node into a TableInfo.

    Logs an error and returns None when the name cannot be resolved.
    """
    table_name = self.visit(ctx.fullId())
    if table_name is None:
        error = 'Invalid Table Name'
        logger.error(error)
        return None
    return TableInfo(table_name=table_name)
def _commit(self):
    """Try to commit. If an error is raised, the session is rolled
    back.

    Raises:
        Exception: wraps the underlying DatabaseError (chained as the
            cause) after rolling the session back.
    """
    try:
        db_session.commit()
    except DatabaseError as e:
        db_session.rollback()
        logger.error(
            "Exception occurred while committing to database.")
        # Fix: chain the original DatabaseError so the root cause is
        # preserved in the traceback.
        raise Exception(
            "Exception occurred while committing to database.") from e
def get_outputs_by_udf_id(self, udf_id: int):
    """Fetch all output (non-input) IO entries for the given UDF id.

    Raises:
        RuntimeError: if the underlying query fails.
    """
    try:
        # `== False` is the SQLAlchemy column-comparison idiom; the
        # noqa suppresses the E712 lint for it.
        query = self.model.query.filter(
            self.model._udf_id == udf_id,
            self.model._is_input == False)  # noqa
        return query.all()
    except Exception as e:
        error = f'Getting outputs for UDF id {udf_id} raised {e}'
        logger.error(error)
        raise RuntimeError(error)
def _get_video_file_path(self, metadata_file): with open(metadata_file, 'rb') as f: (version, ) = struct.unpack('!H', f.read(struct.calcsize('!H'))) if version > self.curr_version: error = 'Invalid metadata version {}'.format(version) logger.error(error) raise RuntimeError(error) (length, ) = struct.unpack('!H', f.read(struct.calcsize('!H'))) path = f.read(length) return Path(path.decode())
def handle_if_not_exists(table_ref: TableRef, if_not_exist=False):
    """Check whether the table being created already exists.

    Args:
        table_ref: reference to the table being created.
        if_not_exist: True when the statement carried IF NOT EXISTS.

    Returns:
        True if the table exists and IF NOT EXISTS was given,
        False if the table does not exist.

    Raises:
        RuntimeError: if the table exists and IF NOT EXISTS was absent.
    """
    if CatalogManager().check_table_exists(table_ref.table.database_name,
                                           table_ref.table.table_name):
        # Fix: typo 'exsits' in the user-facing message.
        err_msg = 'Table: {} already exists'.format(table_ref)
        if if_not_exist:
            # logger.warn is a deprecated alias of logger.warning
            logger.warning(err_msg)
            return True
        logger.error(err_msg)
        raise RuntimeError(err_msg)
    return False
def rename_dataset_by_name(self, new_name: str, curr_database_name: str,
                           curr_dataset_name: str):
    """Rename the dataset identified by its current database and name.

    Raises:
        RuntimeError: if the lookup or the update fails.
    """
    try:
        target = self.dataset_object_by_name(curr_database_name,
                                             curr_dataset_name)
        target.update(_name=new_name)
    except Exception as e:
        err_msg = ("Update dataset name failed for {} with error {}"
                   .format(curr_dataset_name, str(e)))
        logger.error(err_msg)
        raise RuntimeError(err_msg)
def create(self, table: DataFrameMetadata, video_file: Path): # Create directory to store video and metadata related to the video dir_path = Path(table.file_url) try: dir_path.mkdir(parents=True) shutil.copy2(str(video_file), str(dir_path)) except FileExistsError: error = 'Failed to load the video as directory \ already exists: {}'.format(dir_path) logger.error(error) raise FileExistsError(error) self._create_video_metadata(dir_path, video_file.name) return True
def save(self):
    """Add this object to the session and commit.

    Returns:
        The saved object (self).

    Raises:
        Exception: re-raises the underlying failure after logging.
    """
    try:
        db_session.add(self)
        self._commit()
    except Exception as exc:
        logger.error("Object already exists in database")
        raise exc
    return self
def add_expr(self, expr: GroupExpression):
    """Add an expression to this group, stamping it with the group id.

    An expression already tagged with a different group id is rejected
    with an error log and not added.
    """
    if expr.group_id == UNDEFINED_GROUP_ID:
        expr.group_id = self.group_id
    if expr.group_id != self.group_id:
        logger.error('Expected group id {}, found {}'.format(
            self.group_id, expr.group_id))
        return
    # Route logical and physical expressions to their own stores.
    handler = (self._add_logical_expr
               if expr.opr.is_logical() else self._add_physical_expr)
    handler(expr)
def visitUdfFunction(self, ctx: evaql_parser.UdfFunctionContext):
    """Build a FunctionExpression from a UDF call parse node.

    Logs an error (but continues) when the UDF name is missing.
    """
    udf_name = None
    udf_output = None
    if ctx.simpleId():
        udf_name = self.visit(ctx.simpleId())
    else:
        logger.error('UDF function name missing.')
    if ctx.dottedId():
        udf_output = self.visit(ctx.dottedId())

    udf_args = self.visit(ctx.functionArgs())
    func_expr = FunctionExpression(None, name=udf_name,
                                   output=udf_output)
    for argument in udf_args:
        func_expr.append_child(argument)
    return func_expr
def drop_dataset_by_name(self, database_name: str, dataset_name: str):
    """Delete dataset from the db

    Arguments:
        database_name (str): Database to which dataset belongs
        dataset_name (str): name of the dataset

    Returns:
        None. (NOTE(review): the original docstring claimed a
        True/False return, but no value is ever returned; failures
        raise instead.)

    Raises:
        RuntimeError: if the lookup or the delete fails.
    """
    try:
        dataset = self.dataset_object_by_name(database_name,
                                              dataset_name)
        dataset.delete()
    except Exception as e:
        err_msg = "Delete dataset failed for name {} with error {}".format(
            dataset_name, str(e))
        logger.error(err_msg)
        raise RuntimeError(err_msg)
def dataset_by_name(self, name: str) -> int:
    """Returns metadata id for the name queried

    Arguments:
        name (str): Name for which id is required

    Returns:
        int: dataset id, or (implicitly) None when no dataset with
        that name exists.
    """
    try:
        result = (self.model.query.with_entities(
            self.model._id).filter(self.model._name == name).one())
        return result[0]
    except NoResultFound:
        # Fix: the log referenced a different function name
        # ('get_id_from_name'), which made failures hard to trace.
        logger.error("dataset_by_name failed with name {}".format(name))
def visitFullColumnName(self, ctx: evaql_parser.FullColumnNameContext):
    """Build a TupleValueExpression from a (possibly dotted) column
    name.

    Supports `a.b`; anything deeper (a.b.c) is rejected with an error
    log and a None return.
    """
    dotted = ctx.dottedId()
    dotted_ids = []
    if dotted:
        if len(dotted) != 1:
            logger.error("Only tablename.colname syntax supported")
            return None
        dotted_ids = [self.visit(part) for part in dotted]

    uid = self.visit(ctx.uid())
    if dotted_ids:
        return TupleValueExpression(table_alias=uid,
                                    col_name=dotted_ids[0])
    return TupleValueExpression(col_name=uid)
def exec(self):
    """Create materialized view executor

    Validates the child plan (only sequential-scan based
    materialization is supported), builds the view table with column
    types copied from the child's projected columns, then streams the
    child's batches into the storage engine.

    Raises:
        RuntimeError: if the child plan is not a sequential scan, or
            the number of projected columns does not match the view's
            column list.
    """
    # Skip entirely if the view exists and IF NOT EXISTS was given.
    if not handle_if_not_exists(self.node.view, self.node.if_not_exists):
        child = self.children[0]
        # only support seq scan based materialization
        if child.node.opr_type != PlanOprType.SEQUENTIAL_SCAN:
            err_msg = 'Invalid query {}, expected {}'.format(
                child.node.opr_type, PlanOprType.SEQUENTIAL_SCAN)
            logger.error(err_msg)
            raise RuntimeError(err_msg)
        # gather child projected column objects
        child_objs = []
        for child_col in child.project_expr:
            if child_col.etype == ExpressionType.TUPLE_VALUE:
                child_objs.append(child_col.col_object)
            elif child_col.etype == ExpressionType.FUNCTION_EXPRESSION:
                # a function expression can project multiple outputs
                child_objs.extend(child_col.output_objs)

        # Number of projected columns should be equal to mat view columns
        if len(self.node.columns) != len(child_objs):
            err_msg = '# projected columns mismatch, expected {} found {}\
                '.format(len(self.node.columns), len(child_objs))
            logger.error(err_msg)
            raise RuntimeError(err_msg)

        col_defs = []
        # Copy column type info from child columns
        for idx, child_col_obj in enumerate(child_objs):
            col = self.node.columns[idx]
            col_defs.append(
                ColumnDefinition(col.name, child_col_obj.type,
                                 child_col_obj.array_type,
                                 child_col_obj.array_dimensions))

        view_metainfo = create_table_metadata(self.node.view, col_defs)
        StorageEngine.create(table=view_metainfo)

        # Populate the view: drop column aliases so stored names match
        # the view schema, then persist each batch.
        for batch in child.exec():
            batch.drop_column_alias()
            StorageEngine.write(view_metainfo, batch)
def visitCreateUdf(self, ctx: evaql_parser.CreateUdfContext):
    """Assemble a CreateUDFStatement from a CREATE UDF parse tree.

    Walks the context's children and dispatches on each child's rule
    index to collect the UDF name, IF NOT EXISTS flag, input/output
    definitions, type, and implementation path.

    Returns:
        CreateUDFStatement on success; None if any child rule fails
        to parse (the failure is logged and parsing stops).
    """
    udf_name = None
    if_not_exists = False
    input_definitions = []
    output_definitions = []
    impl_path = None
    udf_type = None
    for child in ctx.children:
        try:
            # Terminal tokens (keywords, punctuation) carry no data.
            if isinstance(child, TerminalNode):
                continue
            rule_idx = child.getRuleIndex()
            if rule_idx == evaql_parser.RULE_udfName:
                udf_name = self.visit(ctx.udfName())

            elif rule_idx == evaql_parser.RULE_ifNotExists:
                if_not_exists = True

            elif rule_idx == evaql_parser.RULE_createDefinitions:
                # There should be 2 createDefinition
                # idx 0 describing udf INPUT
                # idx 1 describing udf OUTPUT
                if len(ctx.createDefinitions()) != 2:
                    logger.error('UDF Input or Output Missing')
                input_definitions = self.visit(ctx.createDefinitions(0))
                output_definitions = self.visit(ctx.createDefinitions(1))

            elif rule_idx == evaql_parser.RULE_udfType:
                udf_type = self.visit(ctx.udfType())

            elif rule_idx == evaql_parser.RULE_udfImpl:
                impl_path = self.visit(ctx.udfImpl()).value

        except BaseException:
            logger.error('CREATE UDF Failed')
            # stop parsing something bad happened
            return None

    stmt = CreateUDFStatement(udf_name, if_not_exists, input_definitions,
                              output_definitions, impl_path, udf_type)
    return stmt
def bind_table_info(table_info: TableInfo) -> DataFrameMetadata:
    """
    Uses catalog to bind the dataset information for given table info.

    Arguments:
        table_info (TableInfo): table information obtained in SQL
            query (the original docstring named this `video_info`)

    Returns:
        None; the resolved metadata is stored on
        `table_info.table_obj`.

    Raises:
        RuntimeError: if the table does not exist in the catalog.
    """
    catalog = CatalogManager()
    obj = catalog.get_dataset_metadata(table_info.database_name,
                                       table_info.table_name)
    if obj:
        table_info.table_obj = obj
    else:
        # Fix: message grammar ('does not exists') and the backslash
        # continuation that embedded indentation into the error text.
        error = '{} does not exist. Create the table using ' \
                'CREATE TABLE.'.format(table_info.table_name)
        logger.error(error)
        raise RuntimeError(error)
def path_to_class(filepath: str, classname: str):
    """
    Convert the class in the path file into an object

    Arguments:
        filepath: absolute path of file
        classname: the name of the imported class

    Returns:
        type: A class for given path

    Raises:
        RuntimeError: if the module cannot be loaded or the class is
            missing. (Fix: the original logged the error and then fell
            through to `return classobj`, producing an
            UnboundLocalError that masked the real failure.)
    """
    try:
        abs_path = Path(filepath).resolve()
        spec = importlib.util.spec_from_file_location(abs_path.stem,
                                                      abs_path)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        classobj = getattr(module, classname)
    except Exception as e:
        error = 'Failed to import %s from %s\nException: %s' % (
            classname, filepath, e)
        logger.error(error)
        raise RuntimeError(error) from e
    return classobj
def visitQuerySpecification(
        self, ctx: evaql_parser.QuerySpecificationContext):
    """Translate a query-specification parse tree into a
    SelectStatement (target list, FROM/WHERE, ORDER BY, LIMIT)."""
    target_list = None
    from_clause = None
    where_clause = None
    orderby_clause = None
    limit_count = None

    # ctx.children[0] is the SELECT terminal token; skip it.
    for child in ctx.children[1:]:
        try:
            rule_idx = child.getRuleIndex()
            if rule_idx == evaql_parser.RULE_selectElements:
                target_list = self.visit(child)
            elif rule_idx == evaql_parser.RULE_fromClause:
                clause = self.visit(child)
                from_clause = clause.get('from', None)
                where_clause = clause.get('where', None)
            elif rule_idx == evaql_parser.RULE_orderByClause:
                orderby_clause = self.visit(ctx.orderByClause())
            elif rule_idx == evaql_parser.RULE_limitClause:
                limit_count = self.visit(ctx.limitClause())
        except BaseException as e:
            # stop parsing something bad happened
            logger.error('Error while parsing \
                visitQuerySpecification')
            raise e

    return SelectStatement(target_list, from_clause, where_clause,
                           orderby_clause_list=orderby_clause,
                           limit_count=limit_count)
def column_definition_to_udf_io(
        col_list: List[ColumnDefinition], is_input: bool):
    """Create the UdfIO object for each column definition provided

    Arguments:
        col_list(List[ColumnDefinition]): parsed input/output
            definitions
        is_input(bool): true if input else false

    Returns:
        list of UdfIO catalog objects, one per column definition.

    Raises:
        ValueError: if a column definition is None. (Fix: the
            original logged the error, appended the None to the
            result, and then crashed with AttributeError on
            `col.name`.)
    """
    if isinstance(col_list, ColumnDefinition):
        col_list = [col_list]

    result_list = []
    for col in col_list:
        if col is None:
            error = "Empty column definition while creating udf io"
            logger.error(error)
            raise ValueError(error)
        result_list.append(
            CatalogManager().udf_io(col.name, col.type,
                                    array_type=col.array_type,
                                    dimensions=col.dimension,
                                    is_input=is_input))
    return result_list
def generate_file_path(name: str = '') -> Path:
    """Generates an arbitrary file_path(md5 hash) based on a random
    salt and name

    Arguments:
        name (str): Input file_name.

    Returns:
        Path: pathlib.Path object

    Raises:
        KeyError: if 'datasets_dir' is not configured in eva.yml.
    """
    dataset_location = ConfigurationManager().get_value("core",
                                                        "datasets_dir")
    if dataset_location is None:
        # Fix: the log message named a non-existent 'location' key,
        # while the key actually read is 'datasets_dir'.
        logger.error('Missing datasets_dir key in eva.yml')
        raise KeyError('Missing datasets_dir key in eva.yml')

    dataset_location = Path(dataset_location)
    dataset_location.mkdir(parents=True, exist_ok=True)

    # md5(random salt + name) yields a fixed-length, effectively
    # unique file name.
    salt = uuid.uuid4().hex
    file_name = hashlib.md5(salt.encode() + name.encode()).hexdigest()
    path = dataset_location / file_name
    return path.resolve()
def exec(self):
    """Validate the video file path (falling back to the upload
    directory), persist the video via the storage engine, and yield a
    confirmation batch.

    Raises:
        RuntimeError: if no video file exists at the given location.
    """
    video_file_path = None
    # Prefer the path exactly as given; otherwise check the upload
    # directory.
    if Path(self.node.file_path).exists():
        video_file_path = self.node.file_path
    else:
        candidate = Path(self.upload_path / self.node.file_path)
        if candidate.exists():
            video_file_path = candidate

    if video_file_path is None:
        error = "Failed to find a video file at location: {}".format(
            self.node.file_path)
        logger.error(error)
        raise RuntimeError(error)

    success = VideoStorageEngine.create(self.node.table_metainfo,
                                        video_file_path)

    # ToDo: Add logic for indexing the video file
    # Create an index of I frames to speed up random video seek
    if success:
        yield Batch(
            pd.DataFrame(
                {
                    "Video successfully added at location: ":
                        str(self.node.file_path)
                },
                index=[0],
            ))
def create_column_metadata(col_list: List[ColumnDefinition]):
    """Create column metadata for the input parsed column list.

    This function will not commit the provided column into catalog
    table. Will only return in memory list of ColumnDataframe objects.

    Arguments:
        col_list {List[ColumnDefinition]} -- parsed col list to be
            created

    Returns:
        list of in-memory column metadata objects.

    Raises:
        ValueError: if a column definition is None. (Fix: the
            original logged the error, appended the None to the
            result, and then crashed with AttributeError on
            `col.name`.)
    """
    if isinstance(col_list, ColumnDefinition):
        col_list = [col_list]

    result_list = []
    for col in col_list:
        if col is None:
            error = "Empty column while creating column metadata"
            logger.error(error)
            raise ValueError(error)
        result_list.append(
            CatalogManager().create_column_metadata(
                col.name, col.type, col.array_type, col.dimension))
    return result_list
def get_group_by_id(self, group_id: int) -> GroupExpression: if group_id in self._groups.keys(): return self._groups[group_id] else: logger.error('Missing group id')