def parse_and_execute(self, parsed_args: Dict):
        """Execute a SQL query and upload its result to a new Aito table.

        Runs ``parsed_args['query']`` through the configured SQL connection,
        infers an Aito table schema from the resulting DataFrame, creates the
        table named ``parsed_args['table-name']`` and uploads the data as a
        gzipped ndjson file.

        :param parsed_args: parsed CLI arguments; must contain 'table-name',
            'query' and the client/connection options
        :return: 0 on success
        """
        table_name = parsed_args['table-name']
        client = create_client_from_parsed_args(parsed_args)
        connection = create_sql_connecting_from_parsed_args(parsed_args)

        result_df = connection.execute_query_and_save_result(
            parsed_args['query'])
        inferred_schema = AitoTableSchema.infer_from_pandas_data_frame(
            result_df)

        # delete=False so the file can be re-opened for reading below;
        # we are therefore responsible for unlinking it ourselves.
        converted_tmp_file = tempfile.NamedTemporaryFile(mode='w',
                                                         suffix='.ndjson.gz',
                                                         delete=False)
        try:
            DataFrameHandler().df_to_format(result_df, 'ndjson',
                                            converted_tmp_file.name,
                                            {'compression': 'gzip'})
            converted_tmp_file.close()

            api.create_table(client, table_name, inferred_schema)
            with open(converted_tmp_file.name, 'rb') as in_f:
                api.upload_binary_file(client=client,
                                       table_name=table_name,
                                       binary_file=in_f)
        finally:
            # Fixed: the original called close() twice and leaked the temp
            # file if conversion or upload raised; always close and unlink.
            converted_tmp_file.close()
            unlink(converted_tmp_file.name)
        return 0
Exemplo n.º 2
0
    def parse_and_execute(self, parsed_args: Dict):
        """Convert the input file and, when requested, dump the inferred
        Aito table schema to the path given by 'create_table_schema'."""
        convert_args = self.parsed_args_to_data_frame_handler_convert_args(parsed_args)
        schema_path = parsed_args['create_table_schema'] if parsed_args['create_table_schema'] else None

        df = DataFrameHandler().convert_file(**convert_args)
        if schema_path is not None:
            schema = AitoTableSchema.infer_from_pandas_data_frame(df)
            with schema_path.open(mode='w') as schema_f:
                json.dump(schema.to_json_serializable(), schema_f, indent=2, sort_keys=True)
        return 0
Exemplo n.º 3
0
    def convert_file(
            self,
            read_input: FilePathOrBuffer,
            write_output: FilePathOrBuffer,
            in_format: str,
            out_format: str,
            read_options: Dict = None,
            convert_options: Dict = None,
            apply_functions: List[Callable[..., pd.DataFrame]] = None,
            use_table_schema: Union[AitoTableSchema,
                                    Dict] = None) -> pd.DataFrame:
        """Convert an input file to the requested output format, using a given
        Aito table schema or one inferred from the data.

        :param read_input: read input
        :type read_input: any valid string path, pathlike object, or file-like object (objects with a read() method)
        :param write_output: write output
        :type write_output: any valid string path, pathlike object, or file-like object (objects with a read() method)
        :param in_format: input format
        :type in_format: str
        :param out_format: output format
        :type out_format: str
        :param read_options: dictionary contains arguments for pandas read function, defaults to None
        :type read_options: Dict, optional
        :param convert_options: dictionary contains arguments for pandas write function, defaults to None
        :type convert_options: Dict, optional
        :param apply_functions: list of partial functions that will be applied to the loaded pd.DataFrame, defaults to None
        :type apply_functions: List[Callable[..., pd.DataFrame]], optional
        :param use_table_schema: use an aito schema to dictates data types and convert the data, defaults to None
        :type use_table_schema: an AitoTableSchema object or a Dict, optional
        :return: converted DataFrame
        :rtype: pd.DataFrame
        """
        self._validate_in_out_format(in_format, out_format)

        df = self.read_file_to_df(read_input, in_format, read_options)

        # The handler's default functions always run first, followed by any
        # caller-supplied extras.
        functions_to_apply = list(self.default_apply_functions)
        if apply_functions:
            functions_to_apply += apply_functions
        df = self._apply_functions_on_df(df, functions_to_apply)

        # Fall back to an inferred schema when the caller supplied none.
        schema = use_table_schema
        if schema is None:
            schema = AitoTableSchema.infer_from_pandas_data_frame(df)

        df = self.convert_df_using_aito_table_schema(df, schema)

        # The write is skipped only for a plain same-format pass-through with
        # no convert options and no explicit schema.
        if out_format != in_format or convert_options or use_table_schema:
            self.df_to_format(df, out_format, write_output, convert_options)
        return df
Exemplo n.º 4
0
    def convert_df_using_aito_table_schema(
            df: pd.DataFrame, table_schema: Union[AitoTableSchema,
                                                  Dict]) -> pd.DataFrame:
        """convert a pandas DataFrame to match a given Aito table schema

        NOTE(review): the signature has no ``self``/``cls``, so this is
        presumably decorated with ``@staticmethod`` just above this snippet —
        confirm against the full class definition.

        The input ``df`` is modified in place (see the aliasing note below);
        the returned DataFrame is the same object.

        :param df: input pandas DataFrame
        :type df: pd.DataFrame
        :param table_schema: input table schema
        :type table_schema: an AitoTableSchema object or a Dict, optional
        :raises ValueError: input table schema is invalid
        :raises e: failed to convert
        :return: converted DataFrame
        :rtype: pd.DataFrame
        """
        # Normalize the schema argument: accept a ready AitoTableSchema or a
        # plain dict that can be deserialized into one.
        if not isinstance(table_schema, AitoTableSchema):
            if not isinstance(table_schema, dict):
                raise ValueError(
                    "the input table schema must be either an AitoTableSchema object or a dict"
                )
            table_schema = AitoTableSchema.from_deserialized_object(
                table_schema)

        df_columns = set(df.columns.values)
        table_schema_columns = set(table_schema.columns)

        # Columns present on only one side are warnings, not errors: extra
        # data columns stay unconverted, missing ones are simply skipped.
        for col_name in (df_columns - table_schema_columns):
            LOG.warning(
                f"column `{col_name}` found in the input data but not found in the input schema"
            )
        for col_name in (table_schema_columns - df_columns):
            LOG.warning(
                f"column `{col_name}` found in the input schema but not found in the input data"
            )

        # Build a per-column conversion callable for every column that exists
        # in both the data and the schema.
        conversion_map = {}
        for col_name in table_schema_columns.intersection(df_columns):
            col_schema = table_schema[col_name]
            # A column containing any NaN/None must be declared nullable in
            # the schema, otherwise the conversion is rejected up front.
            col_df_nullable = df[col_name].isna().any()
            if col_df_nullable and not col_schema.nullable:
                raise ValueError(
                    f"column `{col_name}` is nullable but stated non-nullable in the input schema"
                )
            conversion_map[col_name] = col_schema.to_conversion()

        LOG.debug(f"casting dataframe columns: {conversion_map}")
        # NOTE(review): this aliases rather than copies, so the caller's
        # DataFrame is mutated by the column assignments below.
        converted_df = df
        for col_name in conversion_map:
            conversion = conversion_map[col_name]
            converted_df[col_name] = converted_df[col_name].apply(conversion)

        LOG.debug(f"converted the dataframe according to the schema")
        return converted_df
    def test_get_table(self):
        """The get-table CLI command prints the schema of an existing table."""
        self.create_table()
        expected_args = {'command': 'get-table', 'table-name': self.default_table_name}
        expected_args.update(self.default_parser_args)

        with self.out_file_path.open('w') as stdout_f:
            self.parse_and_execute(['get-table', self.default_table_name], expected_args, stub_stdout=stdout_f)

        with self.out_file_path.open() as result_f:
            returned_content = json.load(result_f)
        self.assertEqual(
            AitoTableSchema.from_deserialized_object(returned_content),
            self.default_table_schema
        )
 def setUpClass(cls):
     """Set up class-level fixtures: sample input folder, default parser
     arguments, client, default table schema/name and sample entries.

     NOTE(review): takes ``cls``, so this is presumably decorated with
     ``@classmethod`` just above this snippet — confirm against the full
     class definition.
     """
     super().setUpClass()
     # Redirect the inherited input folder to the shared sample_invoice data.
     cls.input_folder = cls.input_folder.parent.parent / 'sample_invoice'
     cls.default_parser_args = {
         'verbose': False, 'version': False, 'quiet': False,
         'profile': 'default', 'api_key': '.env', 'instance_url': '.env'
     }
     cls.client = default_client()
     with (cls.input_folder / "invoice_aito_schema.json").open() as f:
         json_schema = json.load(f)
     cls.default_table_schema = AitoTableSchema.from_deserialized_object(json_schema)
     # Randomized table name so concurrent test runs don't collide.
     cls.default_table_name = f"invoice_{str(uuid4()).replace('-', '_')}"
     with (cls.input_folder / "invoice_no_null_value.json").open() as f:
         cls.default_entries = json.load(f)
Exemplo n.º 7
0
def quick_add_table(
        client: AitoClient, input_file: Union[Path, PathLike], table_name: str = None, input_format: str = None
):
    """Create a table and upload a file to the table, using the default inferred schema

    :param client: the AitoClient instance
    :type client: AitoClient
    :param input_file: path to the input file to be uploaded
    :type input_file: Union[Path, PathLike]
    :param table_name: the name of the table, defaults to the name of the input file
    :type table_name: Optional[str]
    :param input_format: specify the format of the input file, defaults to the input file extension
    :type input_format: Optional[str]
    :raises ValueError: if the path is invalid or the (explicit or inferred)
        format is not one of the handler's allowed formats
    """
    df_handler = DataFrameHandler()

    try:
        in_f_path = Path(input_file)
    except Exception:
        raise ValueError(f'invalid path: {input_file}')

    if input_format is not None:
        in_format = input_format
    else:
        # Use the first suffix so that e.g. 'data.ndjson.gz' infers 'ndjson'.
        # An extensionless path yields '' and is rejected below with a clear
        # ValueError (the original raised a bare IndexError here).
        in_format = in_f_path.suffixes[0].replace('.', '') if in_f_path.suffixes else ''
    if in_format not in df_handler.allowed_format:
        raise ValueError(f'invalid file format {in_format}. Must be one of {"|".join(df_handler.allowed_format)}')

    table_name = in_f_path.stem if table_name is None else table_name

    # delete=False so the file can be re-opened for reading below; cleaned up
    # in the finally block (the original leaked the temp file when conversion
    # or upload failed, and called close() twice).
    converted_tmp_file = tempfile.NamedTemporaryFile(mode='w', suffix='.ndjson.gz', delete=False)
    try:
        converted_df = df_handler.convert_file(
            read_input=in_f_path,
            write_output=converted_tmp_file.name,
            in_format=in_format,
            out_format='ndjson',
            convert_options={'compression': 'gzip'}
        )
        converted_tmp_file.close()

        inferred_schema = AitoTableSchema.infer_from_pandas_data_frame(converted_df)
        create_table(client, table_name, inferred_schema)

        with open(converted_tmp_file.name, 'rb') as in_f:
            upload_binary_file(client=client, table_name=table_name, binary_file=in_f)
    finally:
        converted_tmp_file.close()
        unlink(converted_tmp_file.name)
Exemplo n.º 8
0
    def test_create_and_delete_column(self):
        """End-to-end check of api.create_column / api.delete_column: the new
        column appears in both the table and column schemas, and the table
        schema reverts after the column is deleted."""
        self.addCleanup(self.delete_default_table_and_check)
        self.create_default_table_and_check()
        api.create_column(
            client=self.client, table_name=self.default_table_name, column_name='new_col', schema={'type': 'Int'}
        )

        # Expected table schema = the default schema plus the new Int column.
        updated_tbl_schema = AitoTableSchema.from_deserialized_object(self.default_table_schema)
        new_col_schema = AitoColumnTypeSchema(data_type=AitoIntType())
        updated_tbl_schema['new_col'] = new_col_schema

        instance_tbl_schema = api.get_table_schema(client=self.client, table_name=self.default_table_name)
        self.assertEqual(updated_tbl_schema, instance_tbl_schema)

        instance_col_schema = api.get_column_schema(
            client=self.client, table_name=self.default_table_name, column_name='new_col'
        )
        self.assertEqual(new_col_schema, instance_col_schema)

        # Deleting the column should restore the original table schema.
        api.delete_column(client=self.client, table_name=self.default_table_name, column_name='new_col')
        self.get_default_table_schema_and_check()
Exemplo n.º 9
0
class TestClientRequest(CompareTestCase):
    """Tests for AitoRequest.make_request and BaseRequest validation."""

    # Each tuple is: (case name, HTTP method, endpoint, query payload,
    # expected AitoRequest subclass instance, expected raised error or None).
    @parameterized.expand([
        ('search', 'POST', '/api/v1/_search', {}, aito_requests.SearchRequest({}), None),
        ('predict', 'POST', '/api/v1/_predict', {}, aito_requests.PredictRequest({}), None),
        ('recommend', 'POST', '/api/v1/_recommend', {}, aito_requests.RecommendRequest({}), None),
        ('evaluate', 'POST', '/api/v1/_evaluate', {}, aito_requests.EvaluateRequest({}), None),
        ('similarity', 'POST', '/api/v1/_similarity', {}, aito_requests.SimilarityRequest({}), None),
        ('relate', 'POST', '/api/v1/_relate', {}, aito_requests.RelateRequest({}), None),
        ('query', 'POST', '/api/v1/_query', {}, aito_requests.GenericQueryRequest({}), None),
        ('get_database_schema', 'GET', '/api/v1/schema', {}, aito_requests.GetDatabaseSchemaRequest(), None),
        (
                'get_table_schema', 'GET', '/api/v1/schema/table_name', {},
                aito_requests.GetTableSchemaRequest(table_name='table_name'), None),
        (
                'get_column_schema', 'GET', '/api/v1/schema/table_name/column_name', {},
                aito_requests.GetColumnSchemaRequest(table_name='table_name', column_name='column_name'), None),
        (
                'create_database_schema', 'PUT', '/api/v1/schema',
                {'schema': {'tbl': {'type': 'table', 'columns': {'col1': {'type': 'String', 'nullable': False}}}}},
                aito_requests.CreateDatabaseSchemaRequest(schema=AitoDatabaseSchema(
                    tables={'tbl': AitoTableSchema(
                        columns={'col1': AitoColumnTypeSchema(data_type=AitoStringType())}
                    )}
                )),
                None
        ),
        (
                'create_table_schema', 'PUT', '/api/v1/schema/table_name',
                {'type': 'table', 'columns': {'col1': {'type': 'String', 'nullable': False}}},
                aito_requests.CreateTableSchemaRequest(
                    table_name='table_name',
                    schema=AitoTableSchema(columns={'col1': AitoColumnTypeSchema(data_type=AitoStringType())})
                ),
                None
        ),
        (
                'create_column_schema', 'PUT', '/api/v1/schema/table_name/column_name',
                {'type': 'String', 'nullable': False},
                aito_requests.CreateColumnSchemaRequest(
                    table_name='table_name', column_name='column_name', schema={'type': 'String', 'nullable': False}
                ),
                None
        ),
        ('delete_database_schema', 'DELETE', '/api/v1/schema', {}, aito_requests.DeleteDatabaseSchemaRequest(), None),
        (
                'delete_table_schema', 'DELETE', '/api/v1/schema/table_name', {},
                aito_requests.DeleteTableSchemaRequest(table_name='table_name'), None),
        (
                'delete_column_schema', 'DELETE', '/api/v1/schema/table_name/column_name', {},
                aito_requests.DeleteColumnSchemaRequest(table_name='table_name', column_name='column_name'), None
        ),
        (
                'init_file_upload', 'POST', '/api/v1/data/table_name/file', {},
                aito_requests.InitiateFileUploadRequest(table_name='table_name'), None
        ),
        (
                'trigger_file_processing', 'POST', '/api/v1/data/table_name/file/00000000-0000-0000-0000-000000000000', {},
                aito_requests.TriggerFileProcessingRequest(table_name='table_name', session_id='00000000-0000-0000-0000-000000000000'), None
        ),
        (
                'get_file_processing_status', 'GET', '/api/v1/data/table_name/file/00000000-0000-0000-0000-000000000000', {},
                aito_requests.GetFileProcessingRequest(table_name='table_name', session_id='00000000-0000-0000-0000-000000000000'), None
        ),
        (
                'create_job', 'POST', '/api/v1/jobs/_search', {},
                aito_requests.CreateJobRequest(endpoint='/api/v1/jobs/_search', query={}), None
        ),
        (
                'get_job_status', 'GET', '/api/v1/jobs/00000000-0000-0000-0000-000000000000', {},
                aito_requests.GetJobStatusRequest(job_id='00000000-0000-0000-0000-000000000000'), None
        ),
        (
                'get_job_result', 'GET', '/api/v1/jobs/00000000-0000-0000-0000-000000000000/result', {},
                aito_requests.GetJobResultRequest(job_id='00000000-0000-0000-0000-000000000000'), None
        ),
        # Invalid method and invalid endpoint (missing leading slash) must
        # raise ValueError instead of returning a request instance.
        ('erroneous_method', 'PATCH', '/api/v1/schema', {}, None, ValueError),
        ('erroneous_endpoint', 'GET', 'api/v1/schema', {}, None, ValueError),
    ])
    def test_make_request(self, _, method, endpoint, query, expected_request_instance, error):
        """make_request routes to the expected request class or raises."""
        if error:
            with self.assertRaises(error):
                aito_requests.AitoRequest.make_request(method=method, endpoint=endpoint, query=query)
        else:
            req = aito_requests.AitoRequest.make_request(method=method, endpoint=endpoint, query=query)
            self.assertEqual(req, expected_request_instance)

    def test_base_request_erroneous_method(self):
        """BaseRequest rejects an unsupported HTTP method."""
        with self.assertRaises(ValueError):
            aito_requests.BaseRequest('PATCH', '/api/v1/schema')

    def test_base_request_erroneous_endpoint(self):
        """BaseRequest rejects an endpoint without a leading slash."""
        with self.assertRaises(ValueError):
            aito_requests.BaseRequest('GET', 'api/v1/schema')
Exemplo n.º 10
0
 def get_default_table_schema_and_check(self):
     """Fetch the default table's schema from the instance and assert it
     equals the expected default schema."""
     expected_schema = AitoTableSchema.from_deserialized_object(self.default_table_schema)
     fetched_schema = api.get_table_schema(self.client, self.default_table_name)
     self.assertEqual(expected_schema, fetched_schema)
Exemplo n.º 11
0
 def parse_and_execute(self, parsed_args: Dict):
     """Run the SQL query and print the inferred Aito table schema to stdout."""
     connection = create_sql_connecting_from_parsed_args(parsed_args)
     query_result = connection.execute_query_and_save_result(parsed_args['query'])
     schema = AitoTableSchema.infer_from_pandas_data_frame(query_result)
     json.dump(schema.to_json_serializable(), sys.stdout, indent=4, sort_keys=True)
     return 0
Exemplo n.º 12
0
 def parse_and_execute(self, parsed_args: Dict):
     """Read the input file and print the inferred Aito table schema to stdout."""
     read_args = self.parsed_args_to_data_frame_handler_read_args(parsed_args)
     loaded_df = DataFrameHandler().read_file_to_df(**read_args)
     schema = AitoTableSchema.infer_from_pandas_data_frame(loaded_df)
     json.dump(schema.to_json_serializable(), sys.stdout, indent=4, sort_keys=True)
     return 0