Пример #1
0
class ChinaJournalDatabase:
    """Accessor for the ``ChineseJournal`` collection of the ``publication`` database."""

    def __init__(self, proxy=None):
        """Connect to the collection.

        :param proxy: accepted for interface compatibility; not used here
        """
        self.mongo = MongoDB()
        self.mongo.connect('publication', 'ChineseJournal')
        self.collection = self.mongo.collection

    def getByName(self, journal_name=None, exactly=False, auto=False):
        """Look up journals by their ``中文名称`` field.

        :param str journal_name: name (or regular expression) to search for
        :param bool exactly: when ``auto`` is false, match the name exactly
            instead of treating it as a ``$regex`` pattern
        :param bool auto: try an exact match first and fall back to a
            ``$regex`` match only when the exact match yields nothing
        :return: matching documents
        :rtype: list
        """
        exact_query = {'中文名称': journal_name}
        fuzzy_query = {'中文名称': {'$regex': journal_name}}

        if not auto:
            chosen = exact_query if exactly else fuzzy_query
            return list(self.find(condition=chosen))

        matches = list(self.find(condition=exact_query))
        if not matches:
            matches = list(self.find(condition=fuzzy_query))
        return matches

    def find(self, condition=None):
        """Run a query against the collection.

        :param dict condition: query filter
        :return: whatever the collection's ``find`` returns for the filter
        """
        cursor = self.collection.find(condition)
        return cursor

    def close(self):
        """Close the database connection.

        :return: None
        """
        self.mongo.close()
Пример #2
0
class TagDB:
    """Wrapper around the ``tagdb`` collection of the ``FileIndex`` database."""

    def __init__(self):
        """Connect to the ``tagdb`` collection."""
        self.__tag_db = MongoDB()
        self.__tag_db.connect('FileIndex', 'tagdb')
        self.collection = self.__tag_db.collection

    @staticmethod
    def _files_key(files):
        """Return an order-sensitive, hashable fingerprint of a files list."""
        return '|'.join(str(item) for item in files)

    def update(self, tag):
        """Insert ``tag`` or refresh the ``files`` list of the stored tag.

        The stored document is looked up by ``tag['tag']``.  When it exists
        and differs from ``tag``, its ``files`` field is replaced by
        ``tag['files']``.  When it does not exist, ``tag`` is inserted.

        :param dict tag: tag document; must contain ``tag`` and ``files``
        :return: ``_id`` of the stored document, or ``None`` when ``tag``
            was newly inserted
        """
        tag_found = self.collection.find_one({'tag': tag['tag']})
        if tag_found is None:
            self.collection.insert_one(tag)
            return None

        tagid = tag_found['_id']

        # Compare on copies: the ``files`` lists are flattened to strings only
        # so that the item views become hashable.  The caller's dict and the
        # real list must stay intact, otherwise the joined string would be
        # written to the database in place of the list.
        incoming = dict(tag)
        incoming['files'] = self._files_key(tag['files'])
        stored = {key: value for key, value in tag_found.items() if key != '_id'}
        stored['files'] = self._files_key(tag_found['files'])

        if incoming.items() - stored.items():
            self.collection.find_one_and_update(
                {'_id': tagid},
                {'$set': {'files': list(tag['files'])}},
            )
        return tagid

    def delete_many(self, ids):
        """Delete the documents whose ``_id`` is listed in ``ids``.

        :param ids: iterable of ids; each is wrapped in ``ObjectId``
        :return: None
        """
        for item in ids:
            self.collection.delete_one({'_id': ObjectId(item)})

    def close(self):
        """Close the database connection.

        :return: None
        """
        self.__tag_db.close()
Пример #3
0
class ElisePaper:
    """Build college tables from MongoDB collections and a province spreadsheet."""

    def __init__(self):
        """Load the province spreadsheet and the province collection.

        ``province_economy_dict`` maps ``str(int(row[0]))`` to ``row[2]`` for
        every spreadsheet row after the first.  ``province_dict`` maps
        ``acode`` to ``region`` for every document of ``region.province``.
        """
        self.mongo = MongoDB()
        province_economy = Excel('E:\\data\\college\\province.xlsx').read()
        # The first row is skipped (presumably a header row -- confirm).
        self.province_economy_dict = {
            str(int(row[0])): row[2] for row in province_economy[1:]
        }
        self.mongo.connect('region', 'province')
        self.province_dict = {
            doc['acode']: doc['region'] for doc in self.mongo.collection.find({})
        }

    def get_data_by_region(self, acode, subject='理科'):
        """Return admission rows for students of one region and subject.

        :param acode: value matched against ``student_region``
        :param str subject: value matched against ``subject``
        :return: a header row followed by one row per ``entranceexam`` match
        :rtype: list
        """
        mresult = [['学校', '类型', '录取平均分', '学校地区', '学校评分', '地区收入']]
        self.mongo.connect('college', 'entranceexam')
        query_result = self.mongo.collection.find(
            {'student_region': acode, 'subject': subject}
        )
        self.mongo.connect('college', 'collegeinfo')
        for item in query_result:
            # NOTE(review): assumes every university has a collegeinfo
            # document and every region a spreadsheet entry -- confirm.
            college = self.mongo.collection.find_one({'name': item['university']})
            rating = college['rating']
            income = self.province_economy_dict[item['university_region']]
            mresult.append([
                item['university'], item['type'], item['average_score'],
                item['university_region'], rating, income,
            ])
        return mresult

    def get_college_data(self):
        """Return one row per single-region college, ordered by ``rating``.

        Colleges are those named in ``entranceexam``.  Rows are keyed by
        ``rating``, so two colleges sharing a rating collapse into one row
        (behaviour kept from the original implementation).

        :return: a header row followed by the college rows
        :rtype: list
        """
        self.mongo.connect('college', 'entranceexam')
        colleges = self.mongo.collection.find({}).distinct('university')
        self.mongo.connect('college', 'collegeinfo')
        records = self.mongo.collection.find(
            {'name': {'$in': colleges}},
            projection={'_id': 0, 'name': 1, 'rating': 1, 'score': 1,
                        'region': 1, 'project': 1, 'type': 1},
        )

        # Exactly one region is required: an empty list used to pass the
        # ``< 2`` filter and then fail on ``region[0]``.
        by_rating = {
            record['rating']: record
            for record in records
            if len(record['region']) == 1
        }

        mresult = [['排名', '学校', '类型', '评分', '地区', '地区可支配收入']]
        for rating in sorted(by_rating):
            record = by_rating[rating]
            region = record['region'][0]
            region_income = self.province_economy_dict[region]
            mresult.append([
                record['rating'], record['name'], record['type'],
                record['score'], region, region_income,
            ])
        return mresult

    def close(self):
        """Close the database connection."""
        self.mongo.close()
Пример #4
0
class FileDB:
    """Wrapper around the ``filedb`` collection of the ``FileIndex`` database."""

    def __init__(self):
        """Connect to ``filedb`` and open the companion ``PathDB``."""
        self.__file_db = MongoDB()
        self.__file_db.connect("FileIndex", "filedb")
        self.collection = self.__file_db.collection
        # Used to resolve directory documents for files.
        self.__path_db = PathDB()

    def update(self, file, path):
        """Insert the file's document or bring the stored one up to date.

        The stored document is looked up by ``full_file_name_without_sc``.
        When it exists, only the fields that differ are written with
        ``$set`` and the existing ``_id`` is kept.

        :param file: file object exposing ``parser`` and ``len()``
        :param path: path object exposing ``relative_path``
        :return: ``_id`` of the stored document, or ``None`` when the file
            was not in the database and has just been inserted
        :raises FileNotFoundError: when ``path`` is not present in pathdb
        """
        path_found = self.__path_db.collection.find_one({"path": path.relative_path})
        if path_found is None:
            print("Path wrong!")
            raise FileNotFoundError

        current = self.make_document(file, path, path_found, False)
        file_found = self.collection.find_one(
            {"full_file_name_without_sc": current["full_file_name_without_sc"]}
        )
        if file_found is None:
            self.collection.insert_one(current)
            return None

        fid = file_found["_id"]
        # Both sides are flattened to hashable values so the item views can
        # be subtracted; only the *keys* of the result are used afterwards.
        changed_items = (
            self.make_document(file, path, path_found, True).items()
            - self.make_document_from_db(file_found).items()
        )
        # Take the new values from the storage-format document, so lists and
        # datetimes keep their native types (and ``None`` stays ``None``).
        changes = {key: current[key] for key, _ in changed_items}
        if changes:
            # Update in place instead of inserting a second document for the
            # same file, which would pile up duplicates on every run.
            self.collection.find_one_and_update({"_id": fid}, {"$set": changes})
        return fid

    @staticmethod
    def _to_comparable(document):
        """Flatten dates and lists of ``document`` in place to hashable strings."""
        document["last_modified"] = document["last_modified"].ctime()
        if document["time"] is not None:
            document["time"] = document["time"].ctime()
        for key in ("tags", "projects"):
            if document[key] is not None:
                document[key] = "|".join(document[key])
        return document

    @classmethod
    def make_document(cls, file, path, path_found, for_comparison=False):
        """Build the filedb document describing ``file``.

        :param file: file object exposing ``parser`` and ``len()``
        :param path: path object exposing ``relative_path``
        :param dict path_found: pathdb document of the file's directory
        :param bool for_comparison: when true, dates are rendered with
            ``ctime()`` and ``tags``/``projects`` are joined with ``|`` so
            every value is hashable
        :return: the document
        :rtype: dict
        """
        parser = file.parser
        document = {
            "full_file_name": os.path.join(path.relative_path, parser.path_name),
            "full_file_name_without_sc": os.path.join(
                path.relative_path, parser.path_name_without_special_characters
            ),
            "special_characters": parser.special_character_part,
            "file_name": parser.path_name,
            "directory": path_found["_id"],
            "extension": parser.extension,
            "last_modified": parser.last_modified,
            "size": len(file),
            "author": parser.author,
            "time": parser.time,
            "version": parser.version,
            "tags": parser.tags,
            "projects": parser.projects,
        }
        if for_comparison:
            cls._to_comparable(document)
        return document

    @classmethod
    def make_document_from_db(cls, file):
        """Return a comparison-format copy of a stored filedb document.

        The ``_id`` field is dropped; the input dict is left untouched.

        :param dict file: document read from filedb
        :return: document comparable with ``make_document(..., True)``
        :rtype: dict
        """
        record = dict(file)
        record.pop("_id", None)
        return cls._to_comparable(record)

    def delete_many(self, ids):
        """Delete the documents whose ``_id`` is listed in ``ids``.

        :param ids: iterable of ids; each is wrapped in ``ObjectId``
        :return: None
        """
        for item in ids:
            self.collection.delete_one({"_id": ObjectId(item)})

    def make_tag_document(self):
        """Group file ids by tag.

        :return: mapping of tag to the list of ``_id`` values carrying it
        :rtype: defaultdict
        """
        tags = defaultdict(list)
        for item in self.collection.find({}, {"_id": 1, "tags": 1}):
            # ``tags`` may be stored as None (see make_document) or be absent.
            for tag in item.get("tags") or []:
                tags[tag].append(item["_id"])
        return tags

    def get_files_according_to_path_list(self, path_list):
        """Map each directory path to the file names stored under it.

        :param list path_list: directory paths as stored in pathdb
        :return: ordered mapping of path to list of ``file_name`` values
        :rtype: OrderedDict
        """
        result = OrderedDict()
        for path in path_list:
            # NOTE(review): assumes every path exists in pathdb -- confirm.
            path_found = self.__path_db.collection.find_one({"path": path})
            files_found = self.collection.find({"directory": path_found["_id"]})
            result[path] = [entry["file_name"] for entry in files_found]
        return result

    def find_and_open(self, base_path, temp_path, **condition):
        """Copy the files matching ``condition`` to ``temp_path`` and open it.

        :param str base_path: directory that ``full_file_name`` is joined to
        :param str temp_path: destination directory, opened with
            ``os.startfile`` once the copies are done
        :param condition: keyword filters passed to the collection's ``find``
        :return: ``"None is found!"`` when nothing matches, otherwise
            ``"successfully!"``
        :rtype: str
        """
        result = list(self.collection.find(condition))
        if not result:
            return "None is found!"
        for item in result:
            source_file = os.path.join(base_path, item["full_file_name"])
            destination_file = os.path.join(temp_path, item["file_name"])
            OSOperator.copy_to(source_file, destination_file)
        os.startfile(temp_path)
        return "successfully!"

    def close(self):
        """Close the filedb connection.

        :return: None
        """
        self.__file_db.close()
Пример #5
0
class PathDB:
    """Wrapper around the ``pathdb`` collection of the ``FileIndex`` database.

    Documents have the form::

        {
            path: relative path
            path_name: name of this path
            parent_path_id: _id of the parent path document (None for root)
            last_modified: last modification time
            children_id: list of child path document ids
        }
    """

    def __init__(self):
        """Connect to the ``pathdb`` collection."""
        self.__path_db = MongoDB()
        self.__path_db.connect('FileIndex', 'pathdb')
        self.collection = self.__path_db.collection

    def update(self, path):
        """Insert the path's document or bring the stored one up to date.

        :param path: path object exposing ``relative_path`` and ``parser``
        :return: ``_id`` of the stored document, or ``None`` when the path
            was not in the database and has just been inserted
        """
        document_in_db = self.collection.find_one({'path': path.relative_path})
        if document_in_db is None:
            self.collection.insert_one(self.make_document(path, False))
            return None

        did = document_in_db['_id']
        # ``_id`` and ``children_id`` are not part of the comparison; the
        # date is rendered as text so both sides hold hashable values.
        stored = {
            key: value
            for key, value in document_in_db.items()
            if key not in ('_id', 'children_id')
        }
        stored['last_modified'] = stored['last_modified'].ctime()

        difference = dict(self.make_document(path, True).items() - stored.items())
        if 'last_modified' in difference:
            # Store the real timestamp, not its ctime() rendering.
            difference['last_modified'] = path.parser.last_modified
        if difference:
            self.collection.find_one_and_update({'_id': did}, {'$set': difference})
        return did

    def traverse_and_update(self):
        """Reconcile ``children_id`` lists with ``parent_path_id`` links.

        Pass one adds every record's ``_id`` to its parent's ``children_id``.
        Pass two removes from each ``children_id`` the ids that no longer
        exist or whose document names a different parent.

        :return: None
        """
        for record in self.collection.find({}):
            parent_id = record['parent_path_id']
            if parent_id is None:
                continue
            parent = self.collection.find_one({'_id': parent_id})
            if parent is None:
                # Dangling parent link: skip it.  No upsert is used either,
                # because that would create a stub parent document without
                # ``path`` or ``parent_path_id``.
                continue
            if record['_id'] not in parent.get('children_id', []):
                self.collection.update_one(
                    {'_id': parent_id},
                    {'$addToSet': {'children_id': record['_id']}},
                )

        for record in self.collection.find({}):
            for pid in record['children_id']:
                path_found = self.collection.find_one({'_id': pid})
                if path_found is None or path_found['parent_path_id'] != record['_id']:
                    self.collection.update_one(
                        {'_id': record['_id']},
                        {'$pull': {'children_id': pid}},
                    )

    def make_document(self, path, for_comparison=False):
        """Build the pathdb document describing ``path``.

        :param path: path object exposing ``relative_path``, ``current_path``,
            ``relatvie_parent_path`` and ``parser.last_modified``
        :param bool for_comparison: when true, ``last_modified`` is rendered
            with ``ctime()`` and ``children_id`` is omitted
        :return: the document
        :rtype: dict
        """
        # The root directory is "." and has no parent document.
        if re.match(r'^\.$', path.relative_path) is not None:
            path_name = path.relative_path
            parent_id = None
        else:
            path_name = path.current_path
            # NOTE(review): ``relatvie_parent_path`` is kept exactly as the
            # original spells it; presumably it matches the attribute name on
            # the project's path class -- confirm.  The parent is assumed to
            # be stored already.
            parent_id = self.collection.find_one(
                {'path': path.relatvie_parent_path}
            )['_id']

        last_modified = path.parser.last_modified
        document = {
            'path': path.relative_path,
            'path_name': path_name,
            'parent_path_id': parent_id,
            'last_modified': last_modified.ctime() if for_comparison else last_modified,
        }
        if not for_comparison:
            document['children_id'] = []
        return document

    def _raw_path_tree(self, root='.'):
        """Collect the paths below ``root``; may contain duplicates.

        A root without children yields ``[root]``.  Otherwise the root itself
        is not listed, only its descendants.

        :param str root: path to start from
        :return: list of paths
        :rtype: list
        """
        root_path = self.collection.find_one({'path': root})
        child_path = list(self.collection.find({'parent_path_id': root_path['_id']}))
        if not child_path:
            return [root_path['path']]

        result = []
        for item in child_path:
            result.append(item['path'])
            result.extend(self._raw_path_tree(item['path']))
        return result

    def path_tree(self, root='.'):
        """Return the paths below ``root`` without duplicates, in visit order.

        :param str root: path to start from
        :return: list of paths
        :rtype: list
        """
        seen = set()
        result = []
        for item in self._raw_path_tree(root):
            if item in seen:
                continue
            seen.add(item)
            result.append(item)
        return result

    def delete_many(self, ids):
        """Delete the documents whose ``_id`` is listed in ``ids``.

        :param ids: iterable of ids; each is wrapped in ``ObjectId``
        :return: None
        """
        for item in ids:
            self.collection.delete_one({'_id': ObjectId(item)})

    def close(self):
        """Close the database connection.

        :return: None
        """
        self.__path_db.close()