Пример #1
0
 def __init__(self, schema):
     self.schema = schema
     self.database = schema.schema_name + ".db"
     self.engine = Sqlite3Engine(self.database)
     self.engine.autocommit(False)
     self.metadata = MetaData()
     self.create_all_tables()
Пример #2
0
def Metadata_reflect_unittest():
    """get the metadata from existing sqlite database
    """
    engine = Sqlite3Engine("movies.db")
    meta = MetaData()
    meta.reflect(engine)
    
    print(meta.tables["movie"].__dict__)
    for column in meta.tables["movie"].columns.values():
        print(repr(column))
Пример #3
0
def Metadata_reflect_unittest():
    """get the metadata from existing sqlite database
    """
    engine = Sqlite3Engine("movies.db")
    meta = MetaData()
    meta.reflect(engine)

    print(meta.tables["movie"].__dict__)
    for column in meta.tables["movie"].columns.values():
        print(repr(column))
Пример #4
0
 def __init__(self, schema):
     self.schema = schema
     self.database = schema.schema_name + ".db"
     self.engine = Sqlite3Engine(self.database)
     self.engine.autocommit(False)
     self.metadata = MetaData()
     self.create_all_tables()
Пример #5
0
 def __init__(self, dbname):
     self.engine = Sqlite3Engine(dbname)
     self.metadata = MetaData()
     self.pipeline = deque()
     self.timewrapper = TimeWrapper()
     self.messenger = Messenger()
     self.log = Log()
Пример #6
0
    def create_all_tables(self):
        ## create the main table
        self.metadata = MetaData()
        main_table_columns = [Column(field.field_name, field.sqlite_dtype,
                                 primary_key = field.primary_key,
                                 nullable = field.nullable,
                                 default = field.default,) \
                              for field in self.schema.fields.values()]

        self.main_table = Table(self.schema.schema_name, self.metadata,
                                *main_table_columns)

        ## create keyword table
        for keyword_field in self.schema.keyword_fields:
            Table(
                keyword_field,
                self.metadata,
                Column("keyword", datatype.text, primary_key=True),
                Column("uuid_set", datatype.pythonset),
            )

        self.metadata.create_all(self.engine)
Пример #7
0
 def create_all_tables(self):
     ## create the main table
     self.metadata = MetaData()
     main_table_columns = [Column(field.field_name, field.sqlite_dtype,
                              primary_key = field.primary_key,
                              nullable = field.nullable,
                              default = field.default,) \
                           for field in self.schema.fields.values()]
     
     self.main_table = Table(self.schema.schema_name, self.metadata, *main_table_columns)
     
     ## create keyword table
     for keyword_field in self.schema.keyword_fields:
         Table(keyword_field, self.metadata,
               Column("keyword", datatype.text, primary_key = True),
               Column("uuid_set", datatype.pythonset),
               )
     
     self.metadata.create_all(self.engine)
Пример #8
0
from angora.GADGET.pytimer import Timer
import random
import os

fm = FormatMaster()
try:
    os.remove("performance.db")
except:
    pass

# engine = Sqlite3Engine("performance.db")
engine = Sqlite3Engine(":memory:")
conn = engine.connect
c = engine.cursor

metadata = MetaData()
datatype = DataType()
col_ID = Column("ID", datatype.integer, primary_key=True)
col_name = Column("name", datatype.text)
test = Table("test", metadata, col_ID, col_name)
metadata.create_all(engine)
ins = test.insert()

records = [(i, "abcdefghijklmnopqrstuvwxyz") for i in range(1000)]
records = records + [(random.randint(1, 1000), "abcdefghijklmnopqrstuvwxyz") for i in range(10)]
rows = [Row(("ID", "name"), (i, "abcdefghijklmnopqrstuvwxyz")) for i in range(1000)]
rows = rows + [Row(("ID", "name"), (random.randint(1, 1000), "abcdefghijklmnopqrstuvwxyz")) for i in range(10)]

timer = Timer()

def insert_test1(): # 4.0 - 5.0 second
Пример #9
0
class SearchEngine():
    def __init__(self, schema):
        self.schema = schema
        self.database = schema.schema_name + ".db"
        self.engine = Sqlite3Engine(self.database)
        self.engine.autocommit(False)
        self.metadata = MetaData()
        self.create_all_tables()

    def commit(self):
        self.engine.commit()

    def create_all_tables(self):
        ## create the main table
        self.metadata = MetaData()
        main_table_columns = [Column(field.field_name, field.sqlite_dtype,
                                 primary_key = field.primary_key,
                                 nullable = field.nullable,
                                 default = field.default,) \
                              for field in self.schema.fields.values()]

        self.main_table = Table(self.schema.schema_name, self.metadata,
                                *main_table_columns)

        ## create keyword table
        for keyword_field in self.schema.keyword_fields:
            Table(
                keyword_field,
                self.metadata,
                Column("keyword", datatype.text, primary_key=True),
                Column("uuid_set", datatype.pythonset),
            )

        self.metadata.create_all(self.engine)

    def get_table(self, table_name):
        """根据table_name得到一个表对象
        """
        return self.metadata.tables[table_name]

    def add_one(self, document):
        """用于往数据库中添加数据, 以增量更新的模式更新索引, 性能较低
        1. 往主表格中填充一条文档
        2. 更新倒排索引里的表中的索引数据
        """
        # 将字典document转化为row
        columns, values = list(), list()
        for k, v in document.items():
            columns.append(k)
            values.append(v)
        row = Row(columns, values)

        # 更新主表的数据
        ins = self.main_table.insert()
        self.engine.insert_row(ins, row)

        # 对每一个field所对应的表进行更新
        for keyword_field in self.schema.keyword_fields:
            table = self.get_table(keyword_field)
            ins = table.insert()
            upd = table.update()

            # 对表中所涉及的keyword的行进行更新
            for keyword in document[keyword_field]:
                try:  # 尝试插入新的, 若已经存在, 则进入更新
                    self.engine.insert_row(
                        ins,
                        Row(["keyword", "uuid_set"],
                            [keyword,
                             set([document[self.schema.uuid]])]))
                except:  # 对keyword的行进行更新
                    a = self.engine.select(
                        Select([table.uuid_set
                                ]).where(table.keyword == keyword))

                    new_uuid_set = list(a)[0][0]

                    print(new_uuid_set)
                    new_uuid_set.add(document[self.schema.uuid])

                    self.engine.update(
                        upd.values(uuid_set=new_uuid_set).where(
                            table.keyword == keyword))

    def clone_from_data_stream(self, documents):
        """用于从0开始, 从批量文档中生成数据库, 性能较高
        1. 往主表格中填充一条文档
        2. 从文档中生成倒排索引的索引字典
        3. 往所有的索引表中填充索引
                
        invert_index = {keyword: set_of_uuid}
        """
        import time
        st = time.clock()
        print("正在往数据库 %s 中填充数据..." % self.schema.schema_name)

        # 初始化all_inv_dict
        all_inv_dict = dict()
        for keyword_field in self.schema.keyword_fields:  # initialize a empty dict for invert index
            all_inv_dict[keyword_field] = dict()

        ins = self.main_table.insert()
        counter = 0
        for document in documents:
            counter += 1
            # 将字典document转化为row
            columns, values = list(), list()
            for k, v in document.items():
                columns.append(k)
                values.append(v)
            row = Row(columns, values)

            # 更新主表的数据
            try:
                self.engine.insert_row(ins, row)
            except:
                pass

            # 计算倒排索引
            for keyword_field in self.schema.keyword_fields:
                uuid = document[self.schema.uuid]
                for keyword in document[keyword_field]:
                    if keyword in all_inv_dict[keyword_field]:
                        all_inv_dict[keyword_field][keyword].add(uuid)
                    else:
                        all_inv_dict[keyword_field][keyword] = set([
                            uuid,
                        ])

        # 将all_inv_dict中的数据存入索引表中
        for keyword_field in all_inv_dict:
            table = self.get_table(keyword_field)
            ins = table.insert()
            for keyword, uuid_set in all_inv_dict[keyword_field].items():
                self.engine.insert_record(ins, (keyword, uuid_set))

        print("\t数据库准备完毕, 一共插入了 %s 条数据, 可以进行搜索了! 一共耗时 %s 秒" % (
            counter,
            (time.clock() - st),
        ))

    def create_query(self):
        """生成一个Query对象, 并把引擎所绑定的Schema传给Query
        使得Query能够自行找到Schema中的各个Fields
        """
        return Query(self.schema)

    def _search(self, query):
        """根据query进行单元搜索, 返回record tuple. 本方法为search()和search_document()方法的内核
        """
        main_sqlcmd_select_uuid, main_sqlcmd_select_all, keyword_sqlcmd_list = query.create_sql(
        )

        ### 情况1, 主表和倒排索引表都要被查询
        if (len(keyword_sqlcmd_list) >= 1) and ("WHERE"
                                                in main_sqlcmd_select_uuid):
            # 得到查询主表所筛选出的 result_uuid_set
            result_uuid_set = OrderedSet([
                record[0] for record in self.engine.cursor.execute(
                    main_sqlcmd_select_uuid)
            ])

            # 得到使用倒排索引所筛选出的 keyword_uuid_set
            keyword_uuid_set = OrderedSet(
                set.intersection(*[
                    self.engine.cursor.execute(sqlcmd).fetchone()[0]
                    for sqlcmd in keyword_sqlcmd_list
                ]))
            # 对两者求交集
            result_uuid_set = OrderedSet.intersection(result_uuid_set,
                                                      keyword_uuid_set)
            # 根据结果中的uuid, 去主表中取数据
            for uuid in result_uuid_set:
                record = self.engine.cursor.execute(
                    "SELECT * FROM {0} WHERE {1} = {2}".format(
                        self.schema.schema_name,
                        self.schema.uuid,
                        repr(uuid),
                    )).fetchone()
                yield record

        ### 情况2, 只对倒排索引表查询
        elif (len(keyword_sqlcmd_list) >=
              1) and ("WHERE" not in main_sqlcmd_select_uuid):
            # 得到查询主表所筛选出的 result_uuid_set
            result_uuid_set = OrderedSet([
                record[0] for record in self.engine.cursor.execute(
                    main_sqlcmd_select_uuid)
            ])

            # 得到使用倒排索引所筛选出的 keyword_uuid_set
            keyword_uuid_set = OrderedSet(
                set.intersection(*[
                    self.engine.cursor.execute(sqlcmd).fetchone()[0]
                    for sqlcmd in keyword_sqlcmd_list
                ]))
            # 对两者求交集
            result_uuid_set = OrderedSet.intersection(result_uuid_set,
                                                      keyword_uuid_set)
            # 根据结果中的uuid, 去主表中取数据
            for uuid in result_uuid_set:
                record = self.engine.cursor.execute(
                    "SELECT * FROM {0} WHERE {1} = {2}".format(
                        self.schema.schema_name,
                        self.schema.uuid,
                        repr(uuid),
                    )).fetchone()
                yield record

        ### 情况3, 只对主表查询
        elif (len(keyword_sqlcmd_list) == 0) and ("WHERE"
                                                  in main_sqlcmd_select_uuid):
            for record in self.engine.cursor.execute(main_sqlcmd_select_all):
                yield record

        ### 情况4, 空查询
        else:
            pass

    def search(self, query):
        """根据query进行单元搜索, 返回record tuple
        """
        counter = 0
        for record in self._search(query):
            counter += 1
            if counter <= query.limit_number:
                yield record
            else:
                return

    def search_document(self, query):
        """根据query进行单元搜索, 返回document ordereddict
        example: OrderedDict({field_name: field_value})
        """
        counter = 0
        for record in self.search(query):
            counter += 1
            document = OrderedDict()
            # pack up as a ordered dict
            for field_name, field, value in zip(self.schema.fields.keys(),
                                                self.schema.fields.values(),
                                                record):
                document[field_name] = value
            if counter <= query.limit_number:
                yield document
            else:
                return

    ### =================== 语法糖方法 =========================
    def _get_all_valid_keyword(self, field_name):
        """私有函数, 用于支持Engine.display_valid_keyword, Engine.search_valid_keyword功能
        根据field_name得到该field下所有出现过的keyword
        """
        if field_name in self.schema.keyword_fields:
            all_keywords = [
                row[0]
                for row in self.engine.execute("SELECT keyword FROM %s" %
                                               field_name)
            ]
            return all_keywords
        else:
            raise Exception("ERROR! field_name has to be in %s, yours is %s" %
                            (self.keyword_fields, field_name))

    def _search_valid_keyword(self, field_name, pattern):
        """根据pattern, 搜索在单元名为field_name中可供选择的keyword"""
        result = [
            keyword for keyword in self._get_all_valid_keyword(field_name)
            if pattern in keyword
        ]
        return result

    ### ================== Help ====================
    def print_as_table(self, array):
        for chunk in grouper(array, 5, ""):
            print(
                "\t{0[0]:<20}\t{0[1]:<20}\t{0[2]:<20}\t{0[3]:<20}\t{0[4]:<20}".
                format(chunk))

    def display_searchable_fields(self):
        """打印所有能被搜索到的单元名和具体类定义"""
        print("\n{:=^100}".format("All searchable fields"))
        for field_name, field in self.schema.fields.items():
            print("\t%s <---> %s" % (field_name, repr(field)))

    def display_keyword_fields(self):
        """打印所有支持倒排索引的单元名和具体类定义"""
        print("\n{:=^100}".format("All keyword fields"))
        for field_name, field in self.schema.fields.items():
            if "_Searchable_KEYWORD" in field.search_types:
                print("\t%s <---> %s" % (field_name, repr(field)))

    def display_criterion(self):
        """打印所有引擎支持的筛选条件"""
        query = self.create_query()
        print("\n{:=^100}".format("All supported criterion"))
        print("\t%s" % query.query_equal.help())
        print("\t%s" % query.query_greater.help())
        print("\t%s" % query.query_smaller.help())
        print("\t%s" % query.query_between.help())
        print("\t%s" % query.query_startwith.help())
        print("\t%s" % query.query_endwith.help())
        print("\t%s" % query.query_like.help())
        print("\t%s" % query.query_contains.help())

    def display_valid_keyword(self, field_name):
        """打印某个单元下所有有效的keyword的集合"""
        print("\n{:=^100}".format("All valid keyword in %s" % field_name))
        if field_name in self.schema.keyword_fields:
            all_keywords = self._get_all_valid_keyword(field_name)
            all_keywords.sort()
            self.print_as_table(all_keywords)
            print("Found %s valid keywords with in %s" %
                  (len(all_keywords), field_name))
        else:
            print("ERROR! field_name has to be in %s, yours is %s" %
                  (self.schema.keyword_fields, field_name))

    def search_valid_keyword(self, field_name, pattern):
        """根据pattern, 打印在单元名为field_name中可供选择的keyword"""
        print("\n{:=^100}".format("All valid keyword with pattern %s in %s" %
                                  (pattern, field_name)))
        result = self._search_valid_keyword(field_name, pattern)
        result.sort()
        self.print_as_table(result)
        print("Found %s valid keywords with pattern %s in %s" %
              (len(result), pattern, field_name))
        return result

    def help(self):
        """print help information"""
        text = \
        """
        Use the following command to help you create desired query:
        \tSearchEngine.display_searchable_fields()
        \tSearchEngine.display_keyword_fields()
        \tSearchEngine.display_criterion()
        \tSearchEngine.display_valid_keyword(field_name)
        \tSearchEngine.search_valid_keyword(field_name, pattern)
        """
        print(text)
Пример #10
0
class SearchEngine():
    def __init__(self, schema):
        self.schema = schema
        self.database = schema.schema_name + ".db"
        self.engine = Sqlite3Engine(self.database)
        self.engine.autocommit(False)
        self.metadata = MetaData()
        self.create_all_tables()
    
    def commit(self):
        self.engine.commit()
    
    def create_all_tables(self):
        ## create the main table
        self.metadata = MetaData()
        main_table_columns = [Column(field.field_name, field.sqlite_dtype,
                                 primary_key = field.primary_key,
                                 nullable = field.nullable,
                                 default = field.default,) \
                              for field in self.schema.fields.values()]
        
        self.main_table = Table(self.schema.schema_name, self.metadata, *main_table_columns)
        
        ## create keyword table
        for keyword_field in self.schema.keyword_fields:
            Table(keyword_field, self.metadata,
                  Column("keyword", datatype.text, primary_key = True),
                  Column("uuid_set", datatype.pythonset),
                  )
        
        self.metadata.create_all(self.engine)
        
    def get_table(self, table_name):
        """根据table_name得到一个表对象
        """
        return self.metadata.tables[table_name]
    
    def add_one(self, document):
        """用于往数据库中添加数据, 以增量更新的模式更新索引, 性能较低
        1. 往主表格中填充一条文档
        2. 更新倒排索引里的表中的索引数据
        """
        # 将字典document转化为row
        columns, values = list(), list()
        for k, v in document.items():
            columns.append(k)
            values.append(v)
        row = Row(columns, values)
        
        # 更新主表的数据
        ins = self.main_table.insert()
        self.engine.insert_row(ins, row)
        
        # 对每一个field所对应的表进行更新
        for keyword_field in self.schema.keyword_fields:
            table = self.get_table(keyword_field)
            ins = table.insert()
            upd = table.update()
              
            # 对表中所涉及的keyword的行进行更新
            for keyword in document[keyword_field]:
                try: # 尝试插入新的, 若已经存在, 则进入更新
                    self.engine.insert_row(ins, Row(
                                                    ["keyword", "uuid_set"], 
                                                    [keyword, set([document[self.schema.uuid]])]
                                                     ) )
                except: # 对keyword的行进行更新
                    a = self.engine.select(Select([table.uuid_set]).where(table.keyword == keyword))
                    
                    new_uuid_set = list(a)[0][0]
                    
                    print(new_uuid_set)
                    new_uuid_set.add(document[self.schema.uuid])
                      
                    self.engine.update(upd.values(uuid_set = new_uuid_set).where(table.keyword == keyword) )
                    
    def clone_from_data_stream(self, documents):
        """用于从0开始, 从批量文档中生成数据库, 性能较高
        1. 往主表格中填充一条文档
        2. 从文档中生成倒排索引的索引字典
        3. 往所有的索引表中填充索引
                
        invert_index = {keyword: set_of_uuid}
        """
        import time
        st = time.clock()
        print("正在往数据库 %s 中填充数据..." % self.schema.schema_name)
        
        # 初始化all_inv_dict
        all_inv_dict = dict()
        for keyword_field in self.schema.keyword_fields: # initialize a empty dict for invert index
            all_inv_dict[keyword_field] = dict()
            
        ins = self.main_table.insert()
        counter = 0
        for document in documents:
            counter += 1
            # 将字典document转化为row
            columns, values = list(), list()
            for k, v in document.items():
                columns.append(k)
                values.append(v)
            row = Row(columns, values)
        
            # 更新主表的数据
            try:
                self.engine.insert_row(ins, row)
            except:
                pass
            
            # 计算倒排索引
            for keyword_field in self.schema.keyword_fields:
                uuid = document[self.schema.uuid]
                for keyword in document[keyword_field]:
                    if keyword in all_inv_dict[keyword_field]:
                        all_inv_dict[keyword_field][keyword].add(uuid)
                    else:
                        all_inv_dict[keyword_field][keyword] = set([uuid,])
        
        # 将all_inv_dict中的数据存入索引表中
        for keyword_field in all_inv_dict:
            table = self.get_table(keyword_field)
            ins = table.insert()
            for keyword, uuid_set in all_inv_dict[keyword_field].items():
                self.engine.insert_record(ins, (keyword, uuid_set))
        
        print("\t数据库准备完毕, 一共插入了 %s 条数据, 可以进行搜索了! 一共耗时 %s 秒" % (counter,
                                                               (time.clock() - st), ) )
        

    def create_query(self):
        """生成一个Query对象, 并把引擎所绑定的Schema传给Query
        使得Query能够自行找到Schema中的各个Fields
        """
        return Query(self.schema)
    
    
    def _search(self, query):
        """根据query进行单元搜索, 返回record tuple. 本方法为search()和search_document()方法的内核
        """
        main_sqlcmd_select_uuid, main_sqlcmd_select_all, keyword_sqlcmd_list = query.create_sql()

        ### 情况1, 主表和倒排索引表都要被查询
        if (len(keyword_sqlcmd_list) >= 1) and ("WHERE" in main_sqlcmd_select_uuid):
            # 得到查询主表所筛选出的 result_uuid_set
            result_uuid_set = OrderedSet([record[0] for record in self.engine.cursor.execute(main_sqlcmd_select_uuid)])

            # 得到使用倒排索引所筛选出的 keyword_uuid_set
            keyword_uuid_set = OrderedSet( set.intersection(
                *[self.engine.cursor.execute(sqlcmd).fetchone()[0] for sqlcmd in keyword_sqlcmd_list]
                ) )
            # 对两者求交集
            result_uuid_set = OrderedSet.intersection(result_uuid_set, keyword_uuid_set)
            # 根据结果中的uuid, 去主表中取数据
            for uuid in result_uuid_set:
                record = self.engine.cursor.execute("SELECT * FROM {0} WHERE {1} = {2}".format(self.schema.schema_name,
                                                                                               self.schema.uuid,
                                                                                               repr(uuid),) ).fetchone()
                yield record

        ### 情况2, 只对倒排索引表查询
        elif (len(keyword_sqlcmd_list) >= 1) and ("WHERE" not in main_sqlcmd_select_uuid):
            # 得到查询主表所筛选出的 result_uuid_set
            result_uuid_set = OrderedSet([record[0] for record in self.engine.cursor.execute(main_sqlcmd_select_uuid)])

            # 得到使用倒排索引所筛选出的 keyword_uuid_set
            keyword_uuid_set = OrderedSet( set.intersection(
                *[self.engine.cursor.execute(sqlcmd).fetchone()[0] for sqlcmd in keyword_sqlcmd_list]
                ) )
            # 对两者求交集
            result_uuid_set = OrderedSet.intersection(result_uuid_set, keyword_uuid_set)
            # 根据结果中的uuid, 去主表中取数据
            for uuid in result_uuid_set:
                record = self.engine.cursor.execute("SELECT * FROM {0} WHERE {1} = {2}".format(self.schema.schema_name,
                                                                                               self.schema.uuid,
                                                                                               repr(uuid),) ).fetchone()
                yield record
        
        ### 情况3, 只对主表查询
        elif (len(keyword_sqlcmd_list) == 0) and ("WHERE" in main_sqlcmd_select_uuid):
            for record in self.engine.cursor.execute(main_sqlcmd_select_all):
                yield record
        
        ### 情况4, 空查询
        else:
            pass
    
    def search(self, query):
        """根据query进行单元搜索, 返回record tuple
        """
        counter = 0
        for record in self._search(query):
            counter += 1
            if counter <= query.limit_number:
                yield record
            else:
                return
            
    def search_document(self, query):
        """根据query进行单元搜索, 返回document ordereddict
        example: OrderedDict({field_name: field_value})
        """
        counter = 0
        for record in self.search(query):
            counter += 1
            document = OrderedDict()
            # pack up as a ordered dict
            for field_name, field, value in zip(self.schema.fields.keys(), 
                                                self.schema.fields.values(), 
                                                record):    
                document[field_name] = value
            if counter <= query.limit_number:
                yield document
            else:
                return
            
    ### =================== 语法糖方法 =========================
    def _get_all_valid_keyword(self, field_name):
        """私有函数, 用于支持Engine.display_valid_keyword, Engine.search_valid_keyword功能
        根据field_name得到该field下所有出现过的keyword
        """
        if field_name in self.schema.keyword_fields:
            all_keywords = [row[0] for row in self.engine.execute("SELECT keyword FROM %s" % field_name)]
            return all_keywords
        else:
            raise Exception("ERROR! field_name has to be in %s, yours is %s" % (self.keyword_fields, 
                                                                                field_name))

    def _search_valid_keyword(self, field_name, pattern):
        """根据pattern, 搜索在单元名为field_name中可供选择的keyword"""
        result = [keyword for keyword in self._get_all_valid_keyword(field_name) if pattern in keyword]
        return result

    ### ================== Help ====================
    def print_as_table(self, array):
        for chunk in grouper(array, 5, ""):
            print("\t{0[0]:<20}\t{0[1]:<20}\t{0[2]:<20}\t{0[3]:<20}\t{0[4]:<20}".format(chunk) )
            
    def display_searchable_fields(self):
        """打印所有能被搜索到的单元名和具体类定义"""
        print("\n{:=^100}".format("All searchable fields"))
        for field_name, field in self.schema.fields.items():
            print("\t%s <---> %s" % (field_name, repr(field) ) )

    def display_keyword_fields(self):
        """打印所有支持倒排索引的单元名和具体类定义"""
        print("\n{:=^100}".format("All keyword fields"))
        for field_name, field in self.schema.fields.items():
            if "_Searchable_KEYWORD" in field.search_types:
                print("\t%s <---> %s" % (field_name, repr(field) ) )

    def display_criterion(self):
        """打印所有引擎支持的筛选条件"""
        query = self.create_query()
        print("\n{:=^100}".format("All supported criterion"))
        print("\t%s" % query.query_equal.help())
        print("\t%s" % query.query_greater.help())
        print("\t%s" % query.query_smaller.help())
        print("\t%s" % query.query_between.help())
        print("\t%s" % query.query_startwith.help())
        print("\t%s" % query.query_endwith.help())
        print("\t%s" % query.query_like.help())
        print("\t%s" % query.query_contains.help())

    def display_valid_keyword(self, field_name):
        """打印某个单元下所有有效的keyword的集合"""
        print("\n{:=^100}".format("All valid keyword in %s" % field_name))
        if field_name in self.schema.keyword_fields:
            all_keywords = self._get_all_valid_keyword(field_name)
            all_keywords.sort()
            self.print_as_table(all_keywords)
            print("Found %s valid keywords with in %s" % (len(all_keywords), 
                                                                     field_name) )
        else:
            print("ERROR! field_name has to be in %s, yours is %s" % (self.schema.keyword_fields, 
                                                                      field_name) )

    def search_valid_keyword(self, field_name, pattern):
        """根据pattern, 打印在单元名为field_name中可供选择的keyword"""
        print("\n{:=^100}".format("All valid keyword with pattern %s in %s" % (pattern,
                                                                               field_name) ) )
        result = self._search_valid_keyword(field_name, pattern)
        result.sort()
        self.print_as_table(result)
        print("Found %s valid keywords with pattern %s in %s" % (len(result), 
                                                                 pattern, 
                                                                 field_name))
        return result
    
    def help(self):
        """print help information"""
        text = \
        """
        Use the following command to help you create desired query:
        \tSearchEngine.display_searchable_fields()
        \tSearchEngine.display_keyword_fields()
        \tSearchEngine.display_criterion()
        \tSearchEngine.display_valid_keyword(field_name)
        \tSearchEngine.search_valid_keyword(field_name, pattern)
        """
        print(text)
Пример #11
0
    def _read_metadata(self):
        """construct the metadata for creating the database table
        """
        self.metadata = MetaData()
        datatype = DataType()

        ### map the CSV.dtype definition to pandas.read_csv dtype and sqlite3 dtype
        _pd_dtype_mapping = {
            "TEXT": np.str,
            "INTEGER": np.int64,
            "REAL": np.float64,
            "DATE": np.str,
            "DATETIME": np.str
        }
        _db_dtype_mapping = {
            "TEXT": datatype.text,
            "INTEGER": datatype.integer,
            "REAL": datatype.real,
            "DATE": datatype.date,
            "DATETIME": datatype.datetime
        }

        pd_dtype = dict(
        )  # {"column_name": dtype} for part of columns, other columns using default setting
        db_dtype = dict()  # {"column_name": dtype} for all columns
        for column_name, data_type in self.dtype.items():
            if data_type in _pd_dtype_mapping:
                pd_dtype[column_name] = _pd_dtype_mapping[data_type]
            if data_type in _db_dtype_mapping:
                db_dtype[column_name] = _db_dtype_mapping[data_type]

        ### Read one row, and extract column information from csv
        if self.usecols:
            df = pd.read_csv(self.path,
                             sep=self.sep,
                             header=self.header,
                             nrows=1,
                             dtype=pd_dtype,
                             usecols=self.usecols)
        else:
            df = pd.read_csv(self.path,
                             sep=self.sep,
                             header=self.header,
                             nrows=1,
                             dtype=pd_dtype)

        # 强行转化为字符串, 却表列index = 数据表中的列名, 且为合法字符串
        new_columns = list()
        for i in df.columns:
            if not isinstance(i, str):
                new_columns.append("c" + str(i))
            else:
                new_columns.append(i)
        df.columns = new_columns

        ### Define the right data type in database for each column
        for column_name, data_type in zip(df.columns, df.dtypes):
            if column_name not in db_dtype:
                if data_type in [
                        np.object,
                ]:
                    db_dtype.setdefault(column_name, datatype.text)
                elif data_type in [
                        np.int64, np.int32, np.int16, np.int8, np.int0, np.int
                ]:
                    db_dtype.setdefault(column_name, datatype.integer)
                elif data_type in [
                        np.float64, np.float32, np.float16, np.float
                ]:
                    db_dtype.setdefault(column_name, datatype.real)

        self.pd_dtype = pd_dtype
        self.db_dtype = db_dtype

        ### Construct Database.Table Metadata
        columns = list()
        for column_name, data_type in zip(df.columns, df.dtypes):
            if column_name in self.primary_key_columns:
                primary_key_flag = True
            else:
                primary_key_flag = False
            columns.append(
                Column(column_name,
                       db_dtype[column_name],
                       primary_key=primary_key_flag))

        Table(self.table_name, self.metadata, *columns)
        self.table = self.metadata.tables[self.table_name]
Пример #12
0
from collections import OrderedDict
import datetime


class MyClass():
    """a user customized class stored with document
    """
    def __init__(self, value):
        self.value = value

    def __repr__(self):
        return "MyClass(%s)" % repr(self.value)


engine = Sqlite3Engine(":memory:")
metadata = MetaData()
datatype = DataType()

pythontype = Table(
    "pythontype",
    metadata,
    Column("uuid", datatype.integer, primary_key=True),
    Column("date_type", datatype.date, default=datetime.date(1999, 1, 1)),
    Column("datetime_type",
           datatype.datetime,
           default=datetime.datetime(2000, 1, 1, 0, 0, 0)),
    Column("list_type", datatype.pythonlist, default=list()),
    Column("set_type", datatype.pythonset, default=set()),
    Column("dict_type", datatype.pythondict, default=dict()),
    Column("ordereddict_type", datatype.ordereddict, default=OrderedDict()),
    Column("strset_type", datatype.strset, default=StrSet(["a", "b", "c"])),