Example #1
    def sort_orderby(self, by, sort_type=None):
        """
        In-place sort for ORDER BY.

        Args:
            by: list of column names
            sort_type: list of booleans, one per column in 'by'; True means
                ASC, e.g. [True, False] means [ASC, DESC]
        """
        if by is not None:
            for column in by:
                if column not in self._frames.columns:
                    logger.error(
                        'Cannot orderby non-projected column: {}'.format(
                            column))
                    raise KeyError(
                        'Cannot orderby non-projected column: {}'.format(
                            column))

            # default to ascending order for every column in 'by'
            if sort_type is None:
                sort_type = [True] * len(by)

            self._frames.sort_values(by,
                                     ascending=sort_type,
                                     ignore_index=True,
                                     inplace=True)
        else:
            logger.warning('Columns and sort type are required for orderby')
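For reference, here is a minimal standalone sketch of the underlying pandas call, with made-up data, showing how the True/False flags map onto the 'ascending' argument:

import pandas as pd

df = pd.DataFrame({'id': [2, 1, 2], 'score': [0.5, 0.9, 0.7]})
# [True, False] means ORDER BY id ASC, score DESC
df.sort_values(['id', 'score'], ascending=[True, False],
               ignore_index=True, inplace=True)
print(df)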
Example #2
    def _bind_load_data_statement(self, node: LoadDataStatement):
        table_ref = node.table_ref
        if node.file_options['file_format'] == FileFormatType.VIDEO:
            # Create a new metadata object
            create_video_metadata(table_ref.table.table_name)

        self.bind(table_ref)

        table_ref_obj = table_ref.table.table_obj
        if table_ref_obj is None:
            error = '{} does not exist. Create the table using ' \
                    'CREATE TABLE.'.format(table_ref.table.table_name)
            logger.error(error)
            raise RuntimeError(error)

        # if query had columns specified, we just copy them
        if node.column_list is not None:
            column_list = node.column_list

        # else we curate the column list from the metadata
        else:
            column_list = []
            for column in table_ref_obj.columns:
                column_list.append(
                    TupleValueExpression(
                        col_name=column.name,
                        table_alias=table_ref_obj.name.lower(),
                        col_object=column))

        # bind the columns
        for expr in column_list:
            self.bind(expr)

        node.column_list = column_list
Example #3
    def get_petastorm_column(df_column):

        column_type = df_column.type
        column_name = df_column.name
        column_is_nullable = df_column.is_nullable
        column_array_type = df_column.array_type
        column_array_dimensions = df_column.array_dimensions

        # Reference:
        # https://github.com/uber/petastorm/blob/master/petastorm/
        # tests/test_common.py

        petastorm_column = None
        if column_type == ColumnType.INTEGER:
            petastorm_column = UnischemaField(column_name, np.int32, (),
                                              ScalarCodec(IntegerType()),
                                              column_is_nullable)
        elif column_type == ColumnType.FLOAT:
            petastorm_column = UnischemaField(column_name, np.float64, (),
                                              ScalarCodec(FloatType()),
                                              column_is_nullable)
        elif column_type == ColumnType.TEXT:
            petastorm_column = UnischemaField(column_name, np.str_, (),
                                              ScalarCodec(StringType()),
                                              column_is_nullable)
        elif column_type == ColumnType.NDARRAY:
            np_type = NdArrayType.to_numpy_type(column_array_type)
            petastorm_column = UnischemaField(column_name, np_type,
                                              column_array_dimensions,
                                              NdarrayCodec(),
                                              column_is_nullable)
        else:
            logger.error("Invalid column type: " + str(column_type))

        return petastorm_column
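For context, a minimal sketch of how fields like those above combine into a petastorm Unischema, assuming petastorm and pyspark are installed (the schema and field names here are made up):

import numpy as np
from petastorm.codecs import ScalarCodec
from petastorm.unischema import Unischema, UnischemaField
from pyspark.sql.types import IntegerType

# UnischemaField layout: (name, numpy dtype, shape, codec, is_nullable)
FrameSchema = Unischema('FrameSchema', [
    UnischemaField('id', np.int32, (), ScalarCodec(IntegerType()), False),
])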
Example #4
 def delete(self):
     """Delete and commit"""
     try:
         db_session.delete(self)
         self._commit()
     except Exception as e:
         logger.error("Object couldn't be deleted")
         raise Exception("Object couldn't be deleted") from e
Example #5
    def visitTableName(self, ctx: evaql_parser.TableNameContext):

        table_name = self.visit(ctx.fullId())
        if table_name is not None:
            table_info = TableInfo(table_name=table_name)
            return table_info
        else:
            error = 'Invalid Table Name'
            logger.error(error)
Example #6
 def _commit(self):
     """Try to commit. If an error is raised, the session is rolled back."""
     try:
         db_session.commit()
     except DatabaseError as e:
         db_session.rollback()
         logger.error(
             "Exception occurred while committing to database.")
         raise Exception(
             "Exception occurred while committing to database.") from e
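The same commit-or-rollback pattern in standalone form, a sketch that assumes any SQLAlchemy session object:

from sqlalchemy.exc import DatabaseError

def safe_commit(session):
    # Commit the session; roll back and re-raise on database errors.
    try:
        session.commit()
    except DatabaseError:
        session.rollback()
        raise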
Example #7
 def get_outputs_by_udf_id(self, udf_id: int):
     try:
         result = self.model.query \
             .filter(self.model._udf_id == udf_id,
                     self.model._is_input == False).all()  # noqa
         return result
     except Exception as e:
         error = f'Getting outputs for UDF id {udf_id} raised {e}'
         logger.error(error)
         raise RuntimeError(error)
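Note that the '== False' comparison (rather than the more Pythonic 'is False') is deliberate: SQLAlchemy overloads '==' on column attributes to build the SQL predicate, and 'is' cannot be overloaded, which is why the flake8 E712 warning is suppressed with '# noqa'.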
Example #8
 def _get_video_file_path(self, metadata_file):
     with open(metadata_file, 'rb') as f:
         (version, ) = struct.unpack('!H', f.read(struct.calcsize('!H')))
         if version > self.curr_version:
             error = 'Invalid metadata version {}'.format(version)
             logger.error(error)
             raise RuntimeError(error)
         (length, ) = struct.unpack('!H', f.read(struct.calcsize('!H')))
         path = f.read(length)
         return Path(path.decode())
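A hypothetical writer for this format, mirroring the reader above (the function name and layout are assumptions inferred from the unpacking code): a 2-byte big-endian version, a 2-byte path length, then the encoded path.

import struct

def write_video_metadata(metadata_file, version, video_path):
    # Pack version and path length as unsigned big-endian shorts ('!H'),
    # then append the UTF-8 encoded path itself.
    encoded = str(video_path).encode()
    with open(metadata_file, 'wb') as f:
        f.write(struct.pack('!H', version))
        f.write(struct.pack('!H', len(encoded)))
        f.write(encoded)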
Example #9
def handle_if_not_exists(table_ref: TableRef, if_not_exist=False):
    if CatalogManager().check_table_exists(table_ref.table.database_name,
                                           table_ref.table.table_name):
        err_msg = 'Table: {} already exists'.format(table_ref)
        if if_not_exist:
            logger.warning(err_msg)
            return True
        else:
            logger.error(err_msg)
            raise RuntimeError(err_msg)
    else:
        return False
Example #10
    def rename_dataset_by_name(self, new_name: str, curr_database_name: str,
                               curr_dataset_name: str):
        try:
            dataset = self.dataset_object_by_name(curr_database_name,
                                                  curr_dataset_name)
            dataset.update(_name=new_name)

        except Exception as e:
            err_msg = "Update dataset name failed for {} with error {}".format(
                curr_dataset_name, str(e))
            logger.error(err_msg)
            raise RuntimeError(err_msg)
Example #11
 def create(self, table: DataFrameMetadata, video_file: Path):
     # Create directory to store video and metadata related to the video
     dir_path = Path(table.file_url)
     try:
         dir_path.mkdir(parents=True)
         shutil.copy2(str(video_file), str(dir_path))
     except FileExistsError:
         error = 'Failed to load the video as directory ' \
                 'already exists: {}'.format(dir_path)
         logger.error(error)
         raise FileExistsError(error)
     self._create_video_metadata(dir_path, video_file.name)
     return True
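The directory-then-copy pattern in isolation, a sketch with hypothetical paths:

import shutil
from pathlib import Path

dir_path = Path('/tmp/eva_datasets/abc123')      # hypothetical dataset dir
dir_path.mkdir(parents=True)                     # raises FileExistsError if it already exists
shutil.copy2('/tmp/myvideo.mp4', str(dir_path))  # hypothetical source video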
Example #12
    def save(self):
        """Add and commit

        Returns: saved object

        """
        try:
            db_session.add(self)
            self._commit()
        except Exception as e:
            logger.error("Failed to save object: {}".format(e))
            raise e
        return self
Example #13
    def add_expr(self, expr: GroupExpression):
        if expr.group_id == UNDEFINED_GROUP_ID:
            expr.group_id = self.group_id

        if expr.group_id != self.group_id:
            logger.error('Expected group id {}, found {}'.format(
                self.group_id, expr.group_id))
            return

        if expr.opr.is_logical():
            self._add_logical_expr(expr)
        else:
            self._add_physical_expr(expr)
Example #14
    def visitUdfFunction(self, ctx: evaql_parser.UdfFunctionContext):
        udf_name = None
        udf_output = None
        if ctx.simpleId():
            udf_name = self.visit(ctx.simpleId())
        else:
            logger.error('UDF function name missing.')
        if ctx.dottedId():
            udf_output = self.visit(ctx.dottedId())

        udf_args = self.visit(ctx.functionArgs())
        func_expr = FunctionExpression(None, name=udf_name, output=udf_output)
        for arg in udf_args:
            func_expr.append_child(arg)

        return func_expr
Example #15
 def drop_dataset_by_name(self, database_name: str, dataset_name: str):
     """Delete dataset from the db
     Arguments:
         database_name  (str): Database to which dataset belongs
         dataset_name (str): name of the dataset
      Raises:
          RuntimeError: if the dataset could not be removed
     """
     try:
         dataset = self.dataset_object_by_name(database_name, dataset_name)
         dataset.delete()
     except Exception as e:
         err_msg = "Delete dataset failed for name {} with error {}".format(
             dataset_name, str(e))
         logger.error(err_msg)
         raise RuntimeError(err_msg)
Example #16
    def dataset_by_name(self, name: str) -> int:
        """
        Returns metadata id for the name queried

        Arguments:
            name (str): Name for which id is required

        Returns:
            int (dataset id)
        """
        try:
            result = (self.model.query.with_entities(
                self.model._id).filter(self.model._name == name).one())
            return result[0]
        except NoResultFound:
            logger.error("get_id_from_name failed with name {}".format(name))
Example #17
    def visitFullColumnName(self, ctx: evaql_parser.FullColumnNameContext):
        # Adding support for a.b
        # Will restrict implementation to raise error for a.b.c
        dottedIds = []
        if ctx.dottedId():
            if len(ctx.dottedId()) != 1:
                logger.error("Only tablename.colname syntax supported")
                return
            for dotted_id in ctx.dottedId():
                dottedIds.append(self.visit(dotted_id))

        uid = self.visit(ctx.uid())

        if len(dottedIds):
            return TupleValueExpression(table_alias=uid, col_name=dottedIds[0])
        else:
            return TupleValueExpression(col_name=uid)
Example #18
    def exec(self):
        """Create materialized view executor
        """
        if not handle_if_not_exists(self.node.view, self.node.if_not_exists):
            child = self.children[0]
            # only support seq scan based materialization
            if child.node.opr_type != PlanOprType.SEQUENTIAL_SCAN:
                err_msg = 'Invalid query {}, expected {}'.format(
                    child.node.opr_type, PlanOprType.SEQUENTIAL_SCAN)

                logger.error(err_msg)
                raise RuntimeError(err_msg)

            # gather child projected column objects
            child_objs = []
            for child_col in child.project_expr:
                if child_col.etype == ExpressionType.TUPLE_VALUE:
                    child_objs.append(child_col.col_object)
                elif child_col.etype == ExpressionType.FUNCTION_EXPRESSION:
                    child_objs.extend(child_col.output_objs)

            # Number of projected columns should be equal to mat view columns
            if len(self.node.columns) != len(child_objs):
                err_msg = '# projected columns mismatch, expected {} ' \
                    'found {}'.format(len(self.node.columns), len(child_objs))
                logger.error(err_msg)
                raise RuntimeError(err_msg)

            col_defs = []
            # Copy column type info from child columns
            for idx, child_col_obj in enumerate(child_objs):
                col = self.node.columns[idx]
                col_defs.append(
                    ColumnDefinition(col.name, child_col_obj.type,
                                     child_col_obj.array_type,
                                     child_col_obj.array_dimensions))

            view_metainfo = create_table_metadata(self.node.view, col_defs)
            StorageEngine.create(table=view_metainfo)

            # Populate the view
            for batch in child.exec():
                batch.drop_column_alias()
                StorageEngine.write(view_metainfo, batch)
Example #19
    def visitCreateUdf(self, ctx: evaql_parser.CreateUdfContext):
        udf_name = None
        if_not_exists = False
        input_definitions = []
        output_definitions = []
        impl_path = None
        udf_type = None

        for child in ctx.children:
            try:
                if isinstance(child, TerminalNode):
                    continue
                rule_idx = child.getRuleIndex()

                if rule_idx == evaql_parser.RULE_udfName:
                    udf_name = self.visit(ctx.udfName())

                elif rule_idx == evaql_parser.RULE_ifNotExists:
                    if_not_exists = True

                elif rule_idx == evaql_parser.RULE_createDefinitions:
                    # There should be exactly 2 createDefinitions:
                    # idx 0 describing udf INPUT
                    # idx 1 describing udf OUTPUT
                    if len(ctx.createDefinitions()) != 2:
                        logger.error('UDF Input or Output Missing')
                    input_definitions = self.visit(ctx.createDefinitions(0))
                    output_definitions = self.visit(ctx.createDefinitions(1))

                elif rule_idx == evaql_parser.RULE_udfType:
                    udf_type = self.visit(ctx.udfType())

                elif rule_idx == evaql_parser.RULE_udfImpl:
                    impl_path = self.visit(ctx.udfImpl()).value

            except BaseException:
                logger.error('CREATE UDF Failed')
                # stop parsing; something bad happened
                return None
        stmt = CreateUDFStatement(udf_name, if_not_exists, input_definitions,
                                  output_definitions, impl_path, udf_type)
        return stmt
Example #20
def bind_table_info(table_info: TableInfo) -> None:
    """
    Uses the catalog to bind dataset metadata to the given table info.

    Arguments:
         table_info (TableInfo): table information obtained from the SQL query

    Raises:
        RuntimeError: if no metadata exists for the given table info
    """
    catalog = CatalogManager()
    obj = catalog.get_dataset_metadata(table_info.database_name,
                                       table_info.table_name)
    if obj:
        table_info.table_obj = obj
    else:
        error = '{} does not exist. Create the table using ' \
                'CREATE TABLE.'.format(table_info.table_name)
        logger.error(error)
        raise RuntimeError(error)
Example #21
def path_to_class(filepath: str, classname: str):
    """
    Convert the class in the path file into an object

    Arguments:
        filepath: absolute path of file
        classname: the name of the imported class

    Returns:
        type: A class for given path
    """
    try:
        abs_path = Path(filepath).resolve()
        spec = importlib.util.spec_from_file_location(abs_path.stem, abs_path)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        classobj = getattr(module, classname)
    except Exception as e:
        err_msg = 'Failed to import {} from {}\nException: {}'.format(
            classname, filepath, e)
        logger.error(err_msg)
        raise RuntimeError(err_msg)
    return classobj
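The importlib pattern in isolation, a sketch assuming a hypothetical file /tmp/my_udf.py that defines a class MyUDF:

import importlib.util
from pathlib import Path

abs_path = Path('/tmp/my_udf.py').resolve()   # hypothetical module file
spec = importlib.util.spec_from_file_location(abs_path.stem, abs_path)
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)               # executes the module body
MyUDF = getattr(module, 'MyUDF')              # hypothetical class name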
Example #22
    def visitQuerySpecification(self,
                                ctx: evaql_parser.QuerySpecificationContext):
        target_list = None
        from_clause = None
        where_clause = None
        orderby_clause = None
        limit_count = None

        # first child will be a SELECT terminal token

        for child in ctx.children[1:]:
            try:
                rule_idx = child.getRuleIndex()
                if rule_idx == evaql_parser.RULE_selectElements:
                    target_list = self.visit(child)

                elif rule_idx == evaql_parser.RULE_fromClause:
                    clause = self.visit(child)
                    from_clause = clause.get('from', None)
                    where_clause = clause.get('where', None)

                elif rule_idx == evaql_parser.RULE_orderByClause:
                    orderby_clause = self.visit(ctx.orderByClause())

                elif rule_idx == evaql_parser.RULE_limitClause:
                    limit_count = self.visit(ctx.limitClause())

            except BaseException as e:
                # stop parsing; something bad happened
                logger.error('Error while parsing visitQuerySpecification')
                raise e

        select_stmt = SelectStatement(target_list,
                                      from_clause,
                                      where_clause,
                                      orderby_clause_list=orderby_clause,
                                      limit_count=limit_count)

        return select_stmt
Example #23
def column_definition_to_udf_io(col_list: List[ColumnDefinition],
                                is_input: bool):
    """Create the UdfIO object fro each column definition provided

    Arguments:
        col_list(List[ColumnDefinition]): parsed input/output definitions
        is_input(bool): true if input else false
    """
    if isinstance(col_list, ColumnDefinition):
        col_list = [col_list]

    result_list = []
    for col in col_list:
        if col is None:
            logger.error("Empty column definition while creating udf io")
            result_list.append(col)
            continue
        result_list.append(CatalogManager().udf_io(col.name,
                                                   col.type,
                                                   array_type=col.array_type,
                                                   dimensions=col.dimension,
                                                   is_input=is_input))
    return result_list
Example #24
def generate_file_path(name: str = '') -> Path:
    """Generates a arbitrary file_path(md5 hash) based on the a random salt
    and name

    Arguments:
        name (str): Input file_name.

    Returns:
        Path: pathlib.Path object

    """
    dataset_location = ConfigurationManager().get_value("core", "datasets_dir")
    if dataset_location is None:
        logger.error('Missing location key in eva.yml')
        raise KeyError('Missing datasets_dir key in eva.yml')

    dataset_location = Path(dataset_location)
    dataset_location.mkdir(parents=True, exist_ok=True)
    salt = uuid.uuid4().hex
    file_name = hashlib.md5(salt.encode() + name.encode()).hexdigest()
    path = dataset_location / file_name
    return path.resolve()
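The salt-and-hash step on its own, with a made-up file name:

import hashlib
import uuid

salt = uuid.uuid4().hex
file_name = hashlib.md5(salt.encode() + 'myvideo.mp4'.encode()).hexdigest()
print(file_name)  # 32-character hex digest, unique per salt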
Example #25
    def exec(self):
        """
        Read the input video using opencv and persist data
        using storage engine
        """

        video_file_path = None
        # Validate file_path
        if Path(self.node.file_path).exists():
            video_file_path = self.node.file_path
        # check in the upload directory
        else:
            video_path = Path(self.upload_path / self.node.file_path)
            if video_path.exists():
                video_file_path = video_path

        if video_file_path is None:
            error = "Failed to find a video file at location: {}".format(
                self.node.file_path)
            logger.error(error)
            raise RuntimeError(error)

        success = VideoStorageEngine.create(self.node.table_metainfo,
                                            video_file_path)

        # ToDo: Add logic for indexing the video file
        # Create an index of I frames to speed up random video seek
        if success:
            yield Batch(
                pd.DataFrame(
                    {
                        "Video successfully added at location: ":
                        str(self.node.file_path)
                    },
                    index=[0],
                ))
Example #26
def create_column_metadata(col_list: List[ColumnDefinition]):
    """Create column metadata for the input parsed column list. This function
    will not commit the provided column into catalog table.
    Will only return in memory list of ColumnDataframe objects

    Arguments:
        col_list {List[ColumnDefinition]} -- parsed col list to be created
    """
    if isinstance(col_list, ColumnDefinition):
        col_list = [col_list]

    result_list = []
    for col in col_list:
        if col is None:
            logger.error(
                "Empty column while creating column metadata")
            result_list.append(col)
            continue
        result_list.append(
            CatalogManager().create_column_metadata(
                col.name, col.type, col.array_type, col.dimension
            )
        )

    return result_list
Example #27
 def get_group_by_id(self, group_id: int) -> GroupExpression:
     if group_id in self._groups:
         return self._groups[group_id]
     else:
         logger.error('Missing group id: {}'.format(group_id))