Пример #1
0
class BaseSaver(object):
    '''
    BaseSaver stores the data fetched from a website into a database.

    The backend is chosen at construction time from ``SAVE_MODES``. The
    connection settings (``MONGO_CONF``, ``NEO_CONF``, ``SQL_CONF``), the
    target ``collection`` / ``table_name``, the ``RESORT_SQL`` statement
    template and the ``save_path`` data directory are module-level names
    defined outside of this class.

    :Usage:
        saver = BaseSaver(save_mode='mysql')
        saver.data_save('resorts')    # loads <save_path>/resorts.json

    '''
    # Backends supported by the saver.
    SAVE_MODES = ('mongodb', 'neo4j', 'mysql')

    def __init__(self, save_mode: str = "neo4j") -> None:
        '''
        Initialize an instance of BaseSaver and connect to the backend.

        :Args:
         - save_mode : a str naming the database to save data in; must be
           one of ``SAVE_MODES``.

        :Raises:
         - RuntimeError : if ``save_mode`` is not a supported backend.

        '''
        if save_mode not in self.SAVE_MODES:
            raise RuntimeError('存储模式指定有误,请输入mongodb、neo4j或者mysql')
        self.save_mode = save_mode
        if self.save_mode == 'mongodb':
            # The connector is the database named by the 'authSource' entry
            # of MONGO_CONF, not the client itself.
            print('>>>> we are in mongodb.')
            self.connector = MongoClient(
                **MONGO_CONF)[MONGO_CONF.get('authSource')]
        elif self.save_mode == 'neo4j':
            print('>>>> we are in neo4j.')
            self.connector = Graph(**NEO_CONF)
        else:
            print('>>>> we are in mysql.')
            self.connector = pymysql.connect(**SQL_CONF)
            self.cursor = self.connector.cursor()
            # RESORT_SQL is formatted with the table name and executed once
            # at start-up (presumably the table DDL -- confirm at its
            # definition).
            sql = RESORT_SQL.format(table_name)
            print(sql)
            self.cursor.execute(sql)
            self.connector.commit()

    def data_save(self, file_name: str) -> None:
        '''
        Saves spider fetched data into the configured database.
        Wipes out the old data and saves the newly fetched records.

        The parsed content of the file is kept on ``self.json_data``.

        :Args:
         - file_name : a str of the file name (without the ``.json``
           suffix) to fetch data from, looked up under ``save_path``.

        :Raises:
         - RuntimeError : if the data file does not exist, or if it holds
           no records while saving to mongodb or mysql (nothing is wiped
           in that case).

        '''
        # Other file types could be supported by converting them to JSON
        # before this point.
        file_path = os.path.join(save_path, file_name + '.json')
        if not os.access(file_path, os.F_OK):
            raise RuntimeError(f'数据文件{file_path}不存在,请检查数据!')
        with open(file_path, 'r', encoding='utf-8') as file:
            # json.load() no longer accepts ``encoding`` (removed in
            # Python 3.9); the file object already decodes UTF-8.
            self.json_data = json.load(file)

        if self.save_mode == 'mongodb':
            print('>>> we are saving to mongodb.')
            records = self._require_records(file_path)
            # Remove the previous data, then store the new records.
            self.connector.drop_collection(collection)
            self.connector[collection].insert_many(records)
        elif self.save_mode == 'neo4j':
            print('>>> we are saving to neo4j.')
            # Destructive: removes the existing graph before rebuilding it.
            self.graph_cleaner()
            self.graph_builder()
        else:
            print('>>> we are saving to mysql.')
            records = self._require_records(file_path)
            # Column list and named placeholders are derived from the keys
            # of the first record; every record is expected to share them.
            data_key = list(records[0].keys())
            sql_key = ','.join(data_key)
            sql_value = ', '.join([f'%({key})s' for key in data_key])
            sql = '''
            INSERT INTO {0}({1})
            VALUES ({2});
            '''.format(table_name, sql_key, sql_value)
            try:
                # Destructive: removes the existing rows before inserting.
                self.cursor.execute(f"DELETE FROM {table_name}")
                print(sql)
                self.cursor.executemany(sql, records)
                self.connector.commit()
            except Exception:
                # Do not leave the DELETE pending if the insert failed.
                self.connector.rollback()
                raise

    def _require_records(self, file_path: str):
        '''
        Return ``self.json_data`` after checking that it is not empty.

        Called before any destructive statement so that an empty data file
        never wipes the existing data.

        :Args:
         - file_path : a str of the data file path, used in the error text.

        :Raises:
         - RuntimeError : if the loaded data is empty.

        '''
        if not self.json_data:
            raise RuntimeError(f'数据文件{file_path}中没有可保存的数据!')
        return self.json_data

    def graph_cleaner(self) -> None:
        '''
        Hook that removes the existing knowledge graph; no-op here.
        '''
        pass

    def graph_builder(self) -> None:
        '''
        Hook that builds the knowledge graph from the data; no-op here.
        '''
        pass

    def __del__(self) -> None:
        '''
        The destructor of BaseSaver class.

        Closes the mongodb client or the mysql cursor and connection.
        Tolerates instances whose ``__init__`` did not complete.
        '''
        # __init__ may have raised before these attributes were assigned.
        mode = getattr(self, 'save_mode', None)
        if mode is None:
            return
        print(f'>>>> closing {mode}.')
        connector = getattr(self, 'connector', None)
        if connector is None:
            return
        if mode == 'mongodb':
            connector.client.close()
        elif mode == 'mysql':
            cursor = getattr(self, 'cursor', None)
            if cursor is not None:
                cursor.close()
            connector.close()