class ChinaJournalDatabase:
    """Connection to the Chinese journal collection.

    Wraps the ``ChineseJournal`` collection of the ``publication`` database.
    """

    def __init__(self, proxy=None):
        # ``proxy`` is accepted but not used by this class.
        self.mongo = MongoDB()
        self.mongo.connect('publication', 'ChineseJournal')
        self.collection = self.mongo.collection

    def getByName(self, journal_name=None, exactly=False, auto=False):
        """Look up journals by their ``中文名称`` field.

        :param journal_name: name (or regex pattern) to search for
        :param bool exactly: when ``auto`` is false, match the name exactly
            instead of as a regular expression
        :param bool auto: try an exact match first and fall back to a regex
            match only if the exact match finds nothing
        :return: matching documents
        :rtype: list
        """
        exact_condition = {'中文名称': journal_name}
        regex_condition = {'中文名称': {'$regex': journal_name}}

        if not auto:
            chosen = exact_condition if exactly else regex_condition
            return list(self.find(condition=chosen))

        matches = list(self.find(condition=exact_condition))
        if not matches:
            matches = list(self.find(condition=regex_condition))
        return matches

    def find(self, condition=None):
        """Run a query against the collection.

        :param dict condition: query filter
        :return: whatever the collection's ``find`` returns for the filter
        """
        return self.collection.find(condition)

    def close(self):
        """Close the database connection.

        :return: None
        """
        self.mongo.close()
class TagDB:
    """Access layer for the ``tagdb`` collection of the ``FileIndex`` database."""

    def __init__(self):
        # Open the collection once and expose it as ``self.collection``.
        self.__tag_db = MongoDB()
        self.__tag_db.connect('FileIndex', 'tagdb')
        self.collection = self.__tag_db.collection

    @staticmethod
    def _joined(files):
        """Return ``files`` as a single '|'-joined string so it is hashable."""
        return '|'.join(str(item) for item in files)

    def update(self, tag):
        """Insert the tag document, or refresh the file list of an existing one.

        The document is looked up by its ``tag`` value. If it exists and
        differs from ``tag``, its ``files`` field is replaced with
        ``tag['files']``. If it does not exist, ``tag`` is inserted.

        ``tag`` itself is never modified, and ``files`` is always written
        back as a list (not as a '|'-joined string).

        :param dict tag: tag document with at least ``tag`` and ``files`` keys
        :return: ``_id`` of the existing document, or None if a new document
            was inserted
        """
        tag_found = self.collection.find_one({'tag': tag['tag']})
        if tag_found is None:
            self.collection.insert_one(tag)
            return None

        tagid = tag_found['_id']

        # Compare on copies: list values are not hashable, so the file lists
        # are flattened to strings for the set difference only.
        stored = {key: value for key, value in tag_found.items() if key != '_id'}
        stored['files'] = self._joined(stored.get('files') or [])
        incoming = dict(tag)
        incoming['files'] = self._joined(tag['files'])

        difference = incoming.items() - stored.items()
        if difference:
            self.collection.find_one_and_update(
                {'_id': tagid},
                {'$set': {'files': list(tag['files'])}},
            )
        return tagid

    def delete_many(self, ids):
        """Delete the documents whose ``_id`` is listed in ``ids``.

        :param ids: iterable of document ids (anything ``ObjectId`` accepts)
        :return: None
        """
        for item in ids:
            self.collection.delete_one({'_id': ObjectId(item)})

    def close(self):
        """Close the database connection.

        :return: None
        """
        self.__tag_db.close()
class ElisePaper:
    """Builds tabular college / entrance-exam data from MongoDB collections.

    On construction it loads a province spreadsheet into
    ``province_economy_dict`` (keyed by ``str(int(first column))``, valued by
    the third column) and the ``region.province`` collection into
    ``province_dict`` (``acode`` -> ``region``).
    """

    def __init__(self):
        self.mongo = MongoDB()

        province_economy = Excel('E:\\data\\college\\province.xlsx').read()
        # The first row is skipped (presumably a header row - TODO confirm).
        self.province_economy_dict = {
            str(int(row[0])): row[2] for row in province_economy[1:]
        }

        self.mongo.connect('region', 'province')
        self.province_dict = {
            doc['acode']: doc['region'] for doc in self.mongo.collection.find({})
        }

    def get_data_by_region(self, acode, subject='理科'):
        """Return a table of admission records for students of one region.

        :param acode: value matched against ``student_region``
        :param str subject: value matched against ``subject``
        :return: list of rows; the first row is the header
        :rtype: list
        """
        table = [['学校', '类型', '录取平均分', '学校地区', '学校评分', '地区收入']]

        self.mongo.connect('college', 'entranceexam')
        exam_records = self.mongo.collection.find(
            {'student_region': acode, 'subject': subject}
        )

        # From here on ``self.mongo.collection`` refers to ``collegeinfo``;
        # ``exam_records`` was obtained before the switch.
        self.mongo.connect('college', 'collegeinfo')
        for exam in exam_records:
            college = self.mongo.collection.find_one({'name': exam['university']})
            rating = college['rating']
            income = self.province_economy_dict[exam['university_region']]
            row = [
                exam['university'],
                exam['type'],
                exam['average_score'],
                exam['university_region'],
                rating,
                income,
            ]
            table.append(row)
        return table

    def get_college_data(self):
        """Return a table of colleges ordered by their ``rating`` value.

        Only colleges that appear in ``entranceexam`` and whose ``region``
        has fewer than two entries are included. Colleges are keyed by
        ``rating``, so for equal ratings the last one read wins.

        :return: list of rows; the first row is the header
        :rtype: list
        """
        self.mongo.connect('college', 'entranceexam')
        colleges = self.mongo.collection.find({}).distinct('university')

        self.mongo.connect('college', 'collegeinfo')
        wanted_fields = {
            '_id': 0, 'name': 1, 'rating': 1, 'score': 1,
            'region': 1, 'project': 1, 'type': 1,
        }
        records = self.mongo.collection.find(
            {'name': {'$in': colleges}}, projection=wanted_fields
        )

        single_region = [record for record in records if len(record['region']) < 2]
        by_rating = {}
        for record in single_region:
            by_rating[record['rating']] = record

        table = [['排名', '学校', '类型', '评分', '地区', '地区可支配收入']]
        for rating in sorted(by_rating):
            college = by_rating[rating]
            region = college['region'][0]
            region_income = self.province_economy_dict[region]
            table.append([
                college['rating'],
                college['name'],
                college['type'],
                college['score'],
                region,
                region_income,
            ])
        return table

    def close(self):
        """Close the database connection."""
        self.mongo.close()
class FileDB:
    """Access layer for the ``filedb`` collection of the ``FileIndex`` database."""

    def __init__(self):
        # File collection.
        self.__file_db = MongoDB()
        self.__file_db.connect("FileIndex", "filedb")
        self.collection = self.__file_db.collection
        # Path collection, used to resolve a file's directory document.
        self.__path_db = PathDB()

    @staticmethod
    def _ctime_or_none(moment):
        """Return ``moment.ctime()``, or None when ``moment`` is None."""
        return None if moment is None else moment.ctime()

    @staticmethod
    def _join_or_none(values):
        """Return ``values`` joined with '|', or None when ``values`` is None."""
        return None if values is None else "|".join(values)

    def update(self, file, path):
        """Synchronise the database entry of ``file``.

        The file is looked up by its special-character-free full name. If a
        stored document exists and equals the current file information, its
        ``_id`` is returned and nothing is written. Otherwise (no stored
        document, or it differs) a fresh document is inserted and None is
        returned; an existing differing document is left in place.

        :param file: file object (provides ``parser`` and ``len()``)
        :param path: path object (provides ``relative_path``)
        :return: ``_id`` of the unchanged stored document, else None
        :raises FileNotFoundError: if ``path.relative_path`` is not in pathdb
        """
        path_found = self.__path_db.collection.find_one({"path": path.relative_path})
        if path_found is None:
            print("Path wrong!")
            raise FileNotFoundError(path.relative_path)

        file_found = self.collection.find_one(
            {
                "full_file_name_without_sc": os.path.join(
                    path.relative_path, file.parser.path_name_without_special_characters
                )
            }
        )

        if file_found is not None:
            fid = file_found["_id"]
            # Both sides are in the hashable "comparison" form, so a set
            # difference of their items lists every changed field. Only its
            # emptiness matters here, so the changed values are not converted
            # back to their stored types.
            difference = (
                self.make_document(file, path, path_found, True).items()
                - self.make_document_from_db(file_found).items()
            )
            if not difference:
                return fid

        self.collection.insert_one(self.make_document(file, path, path_found, False))
        return None

    @classmethod
    def make_document(cls, file, path, path_found, for_comparison=False):
        """Build a filedb document for ``file``.

        :param file: file object
        :param path: path object
        :param dict path_found: path document found in pathdb; its ``_id``
            becomes the ``directory`` field
        :param bool for_comparison: when true, ``last_modified`` and ``time``
            are rendered with ``ctime()`` and ``tags`` / ``projects`` are
            '|'-joined strings, so that every value is hashable
        :return: the document
        :rtype: dict
        """
        parser = file.parser
        document = {
            "full_file_name": os.path.join(path.relative_path, parser.path_name),
            "full_file_name_without_sc": os.path.join(
                path.relative_path, parser.path_name_without_special_characters
            ),
            "special_characters": parser.special_character_part,
            "file_name": parser.path_name,
            "directory": path_found["_id"],
            "extension": parser.extension,
            "last_modified": parser.last_modified,
            "size": len(file),
            "author": parser.author,
            "time": parser.time,
            "version": parser.version,
            "tags": parser.tags,
            "projects": parser.projects,
        }
        if for_comparison:
            document["last_modified"] = parser.last_modified.ctime()
            document["time"] = cls._ctime_or_none(parser.time)
            document["tags"] = cls._join_or_none(parser.tags)
            document["projects"] = cls._join_or_none(parser.projects)
        return document

    @classmethod
    def make_document_from_db(cls, file):
        """Convert a stored filedb document into its comparison form.

        Works on a copy; the document passed in is left untouched.

        :param dict file: document read from filedb
        :return: copy without ``_id`` and with ``last_modified``, ``time``,
            ``tags`` and ``projects`` converted as in
            ``make_document(..., for_comparison=True)``
        :rtype: dict
        """
        record = dict(file)
        record.pop("_id", None)
        record["last_modified"] = record["last_modified"].ctime()
        record["time"] = cls._ctime_or_none(record["time"])
        record["tags"] = cls._join_or_none(record["tags"])
        record["projects"] = cls._join_or_none(record["projects"])
        return record

    def delete_many(self, ids):
        """Delete the documents whose ``_id`` is listed in ``ids``.

        :param ids: iterable of document ids (anything ``ObjectId`` accepts)
        :return: None
        """
        for item in ids:
            self.collection.delete_one({"_id": ObjectId(item)})

    def make_tag_document(self):
        """Group file ids by tag.

        Documents whose ``tags`` is None or missing contribute nothing.

        :return: mapping of tag -> list of file ``_id`` values
        :rtype: defaultdict
        """
        tags = defaultdict(list)
        for item in self.collection.find({}, {"_id": 1, "tags": 1}):
            for tag in item.get("tags") or []:
                tags[tag].append(item["_id"])
        return tags

    def get_files_according_to_path_list(self, path_list):
        """List the file names stored under each path.

        :param list path_list: relative paths
        :return: mapping of path -> list of file names, in ``path_list``
            order; a path unknown to pathdb maps to an empty list
        :rtype: OrderedDict
        """
        result = OrderedDict()
        for path in path_list:
            path_found = self.__path_db.collection.find_one({"path": path})
            if path_found is None:
                result[path] = []
                continue
            files_found = self.collection.find({"directory": path_found["_id"]})
            result[path] = [file["file_name"] for file in files_found]
        return result

    def find_and_open(self, base_path, temp_path, **condition):
        """Copy every file matching ``condition`` to ``temp_path`` and open it.

        :param str base_path: directory the stored ``full_file_name`` values
            are joined onto
        :param str temp_path: destination directory, opened afterwards with
            ``os.startfile`` (available on Windows only)
        :param condition: query filter passed to the collection
        :return: status message
        :rtype: str
        """
        result = list(self.collection.find(condition))
        if not result:
            return "None is found!"

        for item in result:
            source_file = os.path.join(base_path, item["full_file_name"])
            destination_file = os.path.join(temp_path, item["file_name"])
            OSOperator.copy_to(source_file, destination_file)
        os.startfile(temp_path)
        return "successfully!"

    def close(self):
        """Close the database connection.

        :return: None
        """
        self.__file_db.close()
class PathDB:
    """Access layer for the ``pathdb`` collection of the ``FileIndex`` database.

    Documents have the form::

        {
            path:           relative path
            path_name:      name of the current directory
            parent_path_id: _id of the parent path document (None for root)
            last_modified:  last modification time
            children_id:    list of _id values of child directories
        }
    """

    def __init__(self):
        # Open the collection once and expose it as ``self.collection``.
        self.__path_db = MongoDB()
        self.__path_db.connect('FileIndex', 'pathdb')
        self.collection = self.__path_db.collection

    def update(self, path):
        """Insert the path document, or update the stored one if it changed.

        :param path: Path object providing the path information
        :return: ``_id`` of the stored document, or None if the path was not
            in the database and has just been inserted
        """
        document_in_db = self.collection.find_one({'path': path.relative_path})
        if document_in_db is None:
            self.collection.insert_one(self.make_document(path, False))
            return None

        did = document_in_db['_id']
        # Compare without ``_id`` / ``children_id`` and with the timestamp in
        # its ``ctime()`` form, matching ``make_document(path, True)``.
        stored = {
            key: value for key, value in document_in_db.items()
            if key not in ('_id', 'children_id')
        }
        stored['last_modified'] = stored['last_modified'].ctime()

        difference = dict(self.make_document(path, True).items() - stored.items())
        if 'last_modified' in difference:
            # Store the real timestamp, not its string rendering.
            difference['last_modified'] = path.parser.last_modified
        if difference:
            self.collection.find_one_and_update({'_id': did}, {'$set': difference})
        return did

    def traverse_and_update(self):
        """Repair the ``children_id`` lists of all path documents.

        Two passes over the collection:

        1. every document is added to the ``children_id`` of its parent;
           documents whose parent document does not exist are skipped;
        2. every id in a document's ``children_id`` that no longer exists, or
           whose document names a different parent, is removed.

        :return: None
        """
        for record in self.collection.find({}):
            parent_id = record['parent_path_id']
            if parent_id is None:
                continue
            parent = self.collection.find_one({'_id': parent_id})
            if parent is None:
                # Dangling parent reference: nothing to attach the child to.
                continue
            if record['_id'] not in set(parent.get('children_id') or []):
                # No upsert: it could only create a stub parent document
                # lacking ``path`` and ``parent_path_id``.
                self.collection.update_one(
                    {'_id': parent_id},
                    {'$addToSet': {'children_id': record['_id']}},
                )

        for record in self.collection.find({}):
            for pid in record.get('children_id') or []:
                path_found = self.collection.find_one({'_id': pid})
                if path_found is None or path_found['parent_path_id'] != record['_id']:
                    self.collection.update_one(
                        {'_id': record['_id']},
                        {'$pull': {'children_id': pid}},
                    )

    def make_document(self, path, for_comparison=False):
        """Build a pathdb document for ``path``.

        :param path: Path object
        :param bool for_comparison: when true, ``last_modified`` is rendered
            with ``ctime()`` and ``children_id`` is omitted; otherwise the raw
            timestamp is used and ``children_id`` starts as an empty list
        :return: the document
        :rtype: dict
        """
        # The root directory is '.', and it has no parent.
        if re.match(r'^\.$', path.relative_path) is not None:
            path_name = path.relative_path
            parent_id = None
        else:
            # NOTE(review): the attribute is spelled "relatvie_parent_path";
            # kept as-is because the Path class is not visible here - confirm.
            parent_id = self.collection.find_one(
                {'path': path.relatvie_parent_path}
            )['_id']
            path_name = path.current_path

        document = {
            'path': path.relative_path,
            'path_name': path_name,
            'parent_path_id': parent_id,
            'last_modified': path.parser.last_modified,
        }
        if for_comparison:
            document['last_modified'] = path.parser.last_modified.ctime()
        else:
            document['children_id'] = []
        return document

    def _raw_path_tree(self, root='.'):
        """Collect the paths below ``root`` (helper for ``path_tree``).

        A root without children yields just itself; otherwise each child is
        listed followed by its own subtree, so entries may repeat. A root
        that is not in the database yields an empty list.

        :param str root: starting path
        :return: paths, possibly with duplicates
        :rtype: list
        """
        root_path = self.collection.find_one({'path': root})
        if root_path is None:
            return []

        child_path = list(self.collection.find({'parent_path_id': root_path['_id']}))
        if not child_path:
            return [root_path['path']]

        result = []
        for item in child_path:
            result.append(item['path'])
            result.extend(self._raw_path_tree(item['path']))
        return result

    def path_tree(self, root='.'):
        """Return the paths below ``root`` without duplicates.

        :param str root: starting path
        :return: paths in first-seen order
        :rtype: list
        """
        # dict keys keep insertion order, which removes the repeats while
        # preserving the traversal order.
        return list(dict.fromkeys(self._raw_path_tree(root)))

    def delete_many(self, ids):
        """Delete the documents whose ``_id`` is listed in ``ids``.

        :param ids: iterable of document ids (anything ``ObjectId`` accepts)
        :return: None
        """
        for item in ids:
            self.collection.delete_one({'_id': ObjectId(item)})

    def close(self):
        """Close the database connection.

        :return: None
        """
        self.__path_db.close()