class LiteratureReport:
    """Build a LaTeX literature report from records held in MongoDB or a JSON file."""

    def __init__(self, database=None, collection=None, file=None):
        """
        :param str database: MongoDB database name; when given, a connection is opened.
        :param str collection: MongoDB collection name (used together with *database*).
        :param str file: path of a JSON file holding literature records.
        """
        self.literatures = None
        if database is not None:
            self.mongo = MongoDB()
            self.mongo.connect(database, collection)
        if file is not None:
            # Use a context manager so the handle is closed promptly
            # (the original json.load(open(file)) leaked the file handle).
            with open(file) as fp:
                self.literatures = json.load(fp)

    def load_record_from_db(self, query=None, sort=None):
        """Populate self.literatures with a cursor over the connected collection.

        :param dict query: MongoDB filter; None selects every document.
        :param sort: sort specification forwarded to the cursor's sort().
        """
        if query is None:
            self.literatures = self.mongo.collection.find({})
        elif sort is None:
            self.literatures = self.mongo.collection.find(query)
        else:
            self.literatures = self.mongo.collection.find(query).sort(sort)

    def to_report(self):
        """Render the loaded records into a LaTeX article and emit .tex/.pdf files."""
        # NOTE(review): 'arcticleabstract' spelling must match the placeholder
        # in the template file — confirm before "fixing" it.
        replace_word = {'articleTitle': 'Literature Report',
                        'arcticleabstract': 'Abstract'}
        doc = Article(r'E:\latex\template\article_template_02.tex', replace_word)
        for item in self.literatures:
            doc.document.add_section(item['title'], 3)
            doc.document.add_list(
                ['---'.join([item['journal'], item['year'], item['vol'], item['issue']])],
                type=1)
            abstract = item.get('abstract')
            if abstract is not None:
                doc.document.append(abstract)
        doc.document.generate_tex(r'E:\latex\journalreport')
        doc.document.generate_pdf(r'E:\latex\journalreport')
class ChinaJournalDatabase:
    """Wrap the 'publication.ChineseJournal' MongoDB collection of Chinese journals."""

    def __init__(self, proxy=None):
        """
        :param proxy: accepted for interface compatibility; not used by this class.
        """
        self.mongo = MongoDB()
        self.mongo.connect('publication', 'ChineseJournal')
        self.collection = self.mongo.collection

    def getByName(self, journal_name=None, exactly=False, auto=False):
        """Look up journals by their Chinese name.

        :param str journal_name: name (or name fragment) to search for.
        :param bool exactly: when auto is False, require an exact match.
        :param bool auto: try an exact match first, then fall back to a regex match.
        :return: matching journal documents.
        :rtype: list
        """
        exact_query = {'中文名称': journal_name}
        regex_query = {'中文名称': {'$regex': journal_name}}
        if auto:
            result = list(self.find(condition=exact_query))
            # Fall back to a substring (regex) match when the exact lookup
            # found nothing.  (The original wrapped the already-built list in
            # list() a second time before taking len().)
            if not result:
                result = list(self.find(condition=regex_query))
            return result
        if exactly:
            return list(self.find(condition=exact_query))
        return list(self.find(condition=regex_query))

    def find(self, condition=None):
        """Query the collection.

        :param dict condition: MongoDB filter document.
        :return: cursor over the matching documents.
        """
        return self.collection.find(condition)

    def close(self):
        """Close the database connection.

        :return: None
        """
        self.mongo.close()
class TagDB:
    """TagDB wraps the 'FileIndex.tagdb' MongoDB collection of tag documents."""

    def __init__(self):
        # Open the tagdb collection.
        self.__tag_db = MongoDB()
        self.__tag_db.connect('FileIndex', 'tagdb')
        self.collection = self.__tag_db.collection

    def update(self, tag):
        """Insert or refresh the document for one tag.

        :param dict tag: tag document with keys 'tag' and 'files' (list of ids);
            NOTE: mutated in place — 'files' is rewritten as a '|'-joined string
            when an existing document is found.
        :return: _id of the matching tag document, or None when *tag* was newly inserted
        :rtype: bson.objectid.ObjectId or None
        """
        tag_found = self.collection.find_one({'tag': tag['tag']})
        if tag_found is not None:
            tagid = tag_found['_id']
            tag_found.pop('_id')
            # Join both file-id lists into strings so every dict value is
            # hashable and the two documents can be diffed as item sets.
            tag_found['files'] = '|'.join([str(item) for item in tag_found['files']])
            tag['files'] = '|'.join([str(item) for item in tag['files']])
            difference = dict(list(tag.items() - tag_found.items()))
            if len(difference) > 0:
                # NOTE(review): this stores 'files' as the joined string,
                # while the insert branch below stores the caller's original
                # structure — confirm the format mismatch is intentional.
                self.collection.find_one_and_update({'_id': tagid},
                                                    {'$set': {'files': tag['files']}})
                #{'$set':{'files':[ObjectId(item) for item in re.split('\|',difference['files'])]}})
            return tagid
        else:
            self.collection.insert_one(tag)
            return None

    def delete_many(self, ids):
        """Delete tag documents by id.

        :param ids: list of document _id values (string form accepted)
        :return: None
        """
        for item in ids:
            self.collection.delete_one({'_id': ObjectId(item)})

    def close(self):
        """Close the database connection.

        :return: None
        """
        self.__tag_db.close()
def get_ISSN_from_journal(self, auto=True):
    """Look up this object's journal name in 'publication.WesternJournal' and return its ISSN.

    :param bool auto: when True, fall back to a regex (substring) match if the
        exact uppercased name finds nothing; when False, only the exact match is tried.
    :return: the ISSN string, or None when the journal is not found.
    """
    mongo = MongoDB()
    mongo.connect('publication', 'WesternJournal')
    try:
        name = self.journal.upper()
        # The exact and regex branches were duplicated verbatim in the
        # original; a single path with an optional fallback is equivalent.
        result = list(mongo.collection.find({'journal': name}))
        if not result and auto:
            # Exact match failed: retry treating the name as a regex fragment.
            result = list(mongo.collection.find({'journal': {'$regex': name}}))
        if not result:
            return None
        # NOTE(review): the field is spelled 'SSIN' throughout this codebase —
        # confirm that really is the stored field name (not 'ISSN').
        return result[0]['SSIN']
    finally:
        # Both queries are materialized above, so closing here is safe.
        # (The original never closed this connection.)
        mongo.close()
class ElisePaper:
    """Assemble college-entrance datasets joining exam scores, college info and provincial income."""

    def __init__(self):
        self.mongo = MongoDB()
        # Column 0 is the province code (stored as a float in Excel), column 2
        # the income figure; the header row is skipped.
        province_economy = Excel(r'E:\data\college\province.xlsx').read()
        self.province_economy_dict = {str(int(row[0])): row[2]
                                      for row in province_economy[1:]}
        self.mongo.connect('region', 'province')
        self.province_dict = {doc['acode']: doc['region']
                              for doc in self.mongo.collection.find({})}

    def get_data_by_region(self, acode, subject='理科'):
        """Return a table of admission records for students from one province.

        :param str acode: administrative code of the students' province.
        :param str subject: exam track ('理科' = science, the default).
        :return: header row followed by one row per admission record.
        :rtype: list
        """
        mresult = [['学校', '类型', '录取平均分', '学校地区', '学校评分', '地区收入']]
        self.mongo.connect('college', 'entranceexam')
        query_result = self.mongo.collection.find({'student_region': acode, 'subject': subject})
        self.mongo.connect('college', 'collegeinfo')
        for item in query_result:
            # NOTE(review): find_one may return None for an unknown
            # university, which would raise TypeError here — confirm the
            # collections are always consistent.
            rating = self.mongo.collection.find_one({'name': item['university']})['rating']
            income = self.province_economy_dict[item['university_region']]
            mresult.append([item['university'], item['type'], item['average_score'],
                            item['university_region'], rating, income])
        return mresult

    def get_college_data(self):
        """Return one row per single-region college: ranking, name, type, score, region, income.

        :rtype: list
        """
        self.mongo.connect('college', 'entranceexam')
        colleges = self.mongo.collection.find({}).distinct('university')
        self.mongo.connect('college', 'collegeinfo')
        records = self.mongo.collection.find(
            {'name': {'$in': colleges}},
            projection={'_id': 0, 'name': 1, 'rating': 1, 'score': 1,
                        'region': 1, 'project': 1, 'type': 1})
        # Keep only colleges located in a single region, keyed by rating.
        # (The original built an intermediate list and then dict([...]).)
        result = {item['rating']: item
                  for item in records if len(item['region']) < 2}
        mresult = [['排名', '学校', '类型', '评分', '地区', '地区可支配收入']]
        for rating in sorted(result):
            college = result[rating]
            region = college['region'][0]
            mresult.append([college['rating'], college['name'],
                            college['type'], college['score'],
                            region, self.province_economy_dict[region]])
        return mresult

    def close(self):
        """Close the shared MongoDB connection."""
        self.mongo.close()
class FileDB:
    """FileDB wraps the 'FileIndex.filedb' MongoDB collection of per-file metadata."""

    def __init__(self):
        # Open the filedb collection, plus a PathDB handle for directory lookups.
        self.__file_db = MongoDB()
        self.__file_db.connect("FileIndex", "filedb")
        self.collection = self.__file_db.collection
        # pathdb collection wrapper
        self.__path_db = PathDB()

    def update(self, file, path):
        """Synchronise the database record for *file* under *path*.

        :param file: file object (exposes a .parser with name/time/tag metadata).
        :param path: path object (exposes .relative_path).
        :return: _id of the existing, unchanged record; None when a document
            was inserted (new file, or a differing record was re-inserted).
        :rtype: bson.objectid.ObjectId or None
        :raises FileNotFoundError: when *path* is unknown to pathdb.
        """
        # Resolve the directory document for this path.
        path_found = self.__path_db.collection.find_one({"path": path.relative_path})
        if path_found is None:
            print("Path wrong!")
            raise FileNotFoundError
        # Look the file up by its normalised (special-character-free) full name.
        file_found = self.collection.find_one(
            {
                "full_file_name_without_sc": os.path.join(
                    path.relative_path, file.parser.path_name_without_special_characters
                )
            }
        )
        # If the file is already recorded, compare the on-disk metadata with
        # the stored document; otherwise insert a fresh document.
        if file_found is not None:
            # _id of the existing record
            fid = file_found["_id"]
            # Items present on disk but different (or absent) in the database,
            # computed via set difference of the comparison-format documents.
            difference = dict(
                list(
                    self.make_document(file, path, path_found, True).items()
                    - self.make_document_from_db(file_found).items()
                )
            )
            # Convert the comparison-format fields back to storage format.
            if "tags" in difference:
                difference["tags"] = re.split("\|", difference["tags"])
            if "projects" in difference:
                difference["projects"] = re.split("\|", difference["projects"])
            if "last_modified" in difference:
                difference["last_modified"] = file.parser.last_modified
            if "time" in difference:
                difference["time"] = file.parser.time
            if len(difference) > 0:
                # NOTE(review): when a difference exists this INSERTS a fresh
                # document rather than updating the existing one in place (see
                # the commented-out find_one_and_update below), which can leave
                # duplicate records for the same file — confirm this is the
                # intended behaviour.
                self.collection.insert_one(self.make_document(file, path, path_found, False))
                return None
            else:
                # self.collection.find_one_and_update({'_id':fid},{'$set':difference})
                return fid
        else:
            # Unknown file: insert its document.
            self.collection.insert_one(self.make_document(file, path, path_found, False))
            return None

    @classmethod
    def make_document(cls, file, path, path_found, for_comparison=False):
        """Build a filedb-format document for *file* under *path*.

        :param File file: file object
        :param Path path: path object
        :param dict path_found: pathdb document of the file's directory
        :param bool for_comparison: when True, datetimes are rendered via
            .ctime() and list fields are '|'-joined so two documents can be
            diffed as sets of items (all values must be hashable).
        :return: document
        :rtype: dict
        """
        if for_comparison:
            document = {
                "full_file_name": os.path.join(path.relative_path, file.parser.path_name),
                "full_file_name_without_sc": os.path.join(
                    path.relative_path, file.parser.path_name_without_special_characters
                ),
                "special_characters": file.parser.special_character_part,
                "file_name": file.parser.path_name,
                "directory": path_found["_id"],
                "extension": file.parser.extension,
                "last_modified": file.parser.last_modified.ctime(),
                "size": len(file),
                "author": file.parser.author,
                "version": file.parser.version,
            }
            # Optional fields are normalised None-vs-value the same way as
            # make_document_from_db so the set difference is meaningful.
            if file.parser.time is not None:
                document["time"] = file.parser.time.ctime()
            else:
                document["time"] = None
            if file.parser.tags is not None:
                document["tags"] = "|".join(file.parser.tags)
            else:
                document["tags"] = None
            if file.parser.projects is not None:
                document["projects"] = "|".join(file.parser.projects)
            else:
                document["projects"] = None
        else:
            # Storage format: raw datetime objects and raw lists.
            document = {
                "full_file_name": os.path.join(path.relative_path, file.parser.path_name),
                "full_file_name_without_sc": os.path.join(
                    path.relative_path, file.parser.path_name_without_special_characters
                ),
                "special_characters": file.parser.special_character_part,
                "file_name": file.parser.path_name,
                "directory": path_found["_id"],
                "extension": file.parser.extension,
                "last_modified": file.parser.last_modified,
                "size": len(file),
                "author": file.parser.author,
                "time": file.parser.time,
                "version": file.parser.version,
                "tags": file.parser.tags,
                "projects": file.parser.projects,
            }
        return document

    @classmethod
    def make_document_from_db(cls, file):
        """Convert a stored filedb document into the comparison format.

        :param dict file: document fetched from the database
        :return: the same dict with _id removed, datetimes rendered via
            .ctime(), and list fields '|'-joined
        :rtype: dict
        """
        # NOTE(review): 'record' aliases the argument, so the caller's dict is
        # mutated — confirm callers never reuse the original document.
        record = file
        record.pop("_id")
        record["last_modified"] = record["last_modified"].ctime()
        if record["time"] is not None:
            record["time"] = record["time"].ctime()
        else:
            record["time"] = None
        if record["tags"] is not None:
            record["tags"] = "|".join(record["tags"])
        else:
            record["tags"] = None
        if record["projects"] is not None:
            record["projects"] = "|".join(record["projects"])
        else:
            record["projects"] = None
        return record

    def delete_many(self, ids):
        """Delete documents by id.

        :param ids: list of document _id values (string form accepted)
        :return: None
        """
        for item in ids:
            self.collection.delete_one({"_id": ObjectId(item)})

    def make_tag_document(self):
        """Group file _ids by tag, in the format tagdb expects.

        :return: mapping tag -> list of file _ids
        :rtype: defaultdict
        """
        tags = defaultdict(list)
        tag_items = self.collection.find({}, {"_id": 1, "tags": 1})
        for item in tag_items:
            # NOTE(review): a document whose 'tags' field is None would raise
            # here — confirm every stored document carries a list.
            for tag in item["tags"]:
                tags[tag].append(item["_id"])
        return tags

    def get_files_according_to_path_list(self, path_list):
        """List the file names stored under each directory in *path_list*.

        :param list path_list: relative directory paths
        :return: ordered mapping path -> list of file names
        :rtype: OrderedDict
        """
        result = OrderedDict()
        for path in path_list:
            path_found = self.__path_db.collection.find_one({"path": path})
            files_found = self.collection.find({"directory": path_found["_id"]})
            result[path] = [file["file_name"] for file in files_found]
        return result

    def find_and_open(self, base_path, temp_path, **condition):
        """Query filedb, copy each hit into *temp_path* and open that folder.

        :param str base_path: root folder the stored relative file names live under
        :param str temp_path: destination folder for the copies
        :param dict condition: filedb query filter, passed as keyword arguments
        :return: status message
        :rtype: str
        """
        result = list(self.collection.find(condition))
        if len(result) < 1:
            return "None is found!"
        else:
            for item in result:
                source_file = os.path.join(base_path, item["full_file_name"])
                destination_file = os.path.join(temp_path, item["file_name"])
                OSOperator.copy_to(source_file, destination_file)
            # Windows-only: opens an Explorer window on the temp folder.
            os.startfile(temp_path)
            return "successfully!"

    def close(self):
        """Close the filedb database connection.

        :return: None
        """
        self.__file_db.close()
# coding=UTF-8
# Exploratory script: inspect journal names and keyword queries in the
# 'paper.literature' collection.
from libs.database.class_mongodb import MongoDB
from libs.file.class_Excel import Excel

mongo = MongoDB()
mongo.connect('paper', 'literature')

# All distinct journal names in the collection.
result = mongo.collection.find({}).distinct('journal')
print(result)

'''' result = mongo.collection.find({'journal':'Journal of Econometrics', 'keyword':{'$exists':True}}) keyword = set() for item in result: keyword.update([item.lower() for item in item['keyword']]) mdata = [[item] for item in sorted(keyword)] outfile = r'd:\down\keywords.xlsx' moutexcel = Excel(outfile) moutexcel.new().append(mdata, 'mysheet') moutexcel.close()'''

# Print every record whose keyword matches 'autocorrelation'/'Autocorrelation'.
# (The original first ran a {'keyword':'ARMA'} query whose cursor was
# immediately overwritten without ever being read; that dead statement has
# been removed.)
result = mongo.collection.find({'keyword': {'$regex': '^(a|A)utocorrelation$'}})
for item in result:
    print(item['keyword'])
# coding=UTF-8
from libs.database.class_mongodb import MongoDB
from collections import deque

# 1. Open the three FileIndex collections (pathdb drives the walk; the
#    filedb/tagdb handles are opened as part of the same workflow).
path_db, file_db, tag_db = MongoDB(), MongoDB(), MongoDB()
path_db.connect('FileIndex', 'pathdb')
file_db.connect('FileIndex', 'filedb')
tag_db.connect('FileIndex', 'tagdb')

# 2. Depth-first walk of the directory tree rooted at '.', printing each path.
pending = deque(path_db.collection.find({'path': '.'}))
print(pending)
while pending:
    node = pending.pop()
    print(node['path'])
    # Push the children right-to-left so the first child is popped first.
    for child_id in reversed(node['children_id']):
        pending.append(path_db.collection.find_one({'_id': child_id}))
from libs.database.class_mongodb import MongoDB
from libs.latex.class_article import Article

# 1. Initial configuration.
# NOTE(review): this first PROXY_LIST is dead — it is overwritten by the next
# assignment before any use.
PROXY_LIST = ['58.20.128.123:80', '36.7.151.29:8000', '61.174.13.12:80',
              '112.90.179.153:4040', '58.20.235.180:8000', '58.22.86.44:8000',
              '101.226.249.237:80', '111.1.89.254:80', '112.16.87.24:80']
PROXY_LIST = ['111.56.13.150:80', '115.159.5.247:8080', '117.136.234.6:843',
              '60.191.179.53:3128', '60.191.163.235:3128', '120.52.73.33:80']
# CNKI query: journal 经济研究, years 2010-2016, two subject categories.
QUERY_STRING = "JN='经济研究'"
START_PERIOD = "2010"
END_PERIOD = "2016"
SUBJECTS = ["经济与管理科学", "社会科学Ⅱ辑"]
LITERATURE_JSON_FILE = r"E:\gitrobot\files\literature\literature_list.txt"

# Map journal Chinese name -> composite impact factor, read from the
# 'publication.ChineseJournal' collection.
db = MongoDB()
db.connect('publication', 'ChineseJournal')
journals = db.collection.find({}, projection={'_id': 0, '中文名称': 1, '复合影响因子': 1})
jours = dict([(journal['中文名称'], journal.get('复合影响因子')) for journal in journals])
jours_set = jours.keys()

# Stage switches for the workflow below.
STEP_ONE = False
STEP_TWO = True

# 2. Drive the CNKI web site.
if STEP_ONE:
    # NOTE(review): Cnki is not among this file's visible imports — confirm it
    # is defined/imported elsewhere before enabling STEP_ONE.
    cnki_obj = Cnki()
    #cnki_obj = Cnki(PROXY_LIST[random.randint(0,len(PROXY_LIST)-1)])
    cnki_obj.set_query(QUERY_STRING)
    cnki_obj.set_period(start_period=START_PERIOD, end_period=END_PERIOD)
    cnki_obj.set_subject(subjects=SUBJECTS)
    cnki_obj.submit()
class PathDB:
    """PathDB wraps the 'FileIndex.pathdb' MongoDB collection of directory metadata."""

    def __init__(self):
        # Open the pathdb collection.
        self.__path_db = MongoDB()
        self.__path_db.connect('FileIndex', 'pathdb')
        self.collection = self.__path_db.collection

    def update(self, path):
        """Insert or refresh the database document for *path*.

        :param Path path: path object supplying the directory information.
        :return: _id of the matched path document; None when the path was not
            yet in the database (a new document is inserted in that case).
        :rtype: ObjectId or None
        """
        # Find the document matching this relative path.
        document_in_db = self.collection.find_one({'path': path.relative_path})
        # Unknown path: insert a full document (including children_id).
        if document_in_db is None:
            self.collection.insert_one(self.make_document(path, False))
            return None
        else:
            # Known path: diff the stored document against the on-disk state.
            did = document_in_db['_id']
            document_in_db.pop('_id')
            document_in_db.pop('children_id')
            # Comparison happens on ctime() strings (hashable, like
            # make_document's for_comparison format).
            document_in_db['last_modified'] = document_in_db['last_modified'].ctime()
            difference = dict(list(self.make_document(path, True).items() - document_in_db.items()))
            if 'last_modified' in difference:
                # Store the datetime object, not its ctime() string.
                difference['last_modified'] = path.parser.last_modified
            if len(difference) > 0:
                self.collection.find_one_and_update({'_id': did}, {'$set': difference})
            return did

    def traverse_and_update(self):
        """Reconcile the children_id links across the whole collection.

        Two passes: (1) ensure every document's _id appears in its parent's
        children_id list; (2) drop children_id entries pointing at missing
        documents or at documents whose parent_path_id no longer matches.

        :return: None
        """
        # Pass 1: walk every record and register it with its parent.
        for record in self.collection.find({}):
            # _id of this record's parent path (None for the root).
            record_parent_path_id = record['parent_path_id']
            if record_parent_path_id is not None:
                # The parent's children_id set must contain this record.
                children_id = set(self.collection.find_one({'_id': record_parent_path_id})['children_id'])
                if record['_id'] not in children_id:
                    self.collection.update_one({'_id': record_parent_path_id},
                                               {'$addToSet': {'children_id': record.get('_id')}}, upsert=True)
        # Pass 2: validate every child reference.
        for record in self.collection.find({}):
            if len(record['children_id']) > 0:
                for pid in record['children_id']:
                    path_found = self.collection.find_one({'_id': pid})
                    # Child document missing: remove the dangling reference.
                    if path_found is None:
                        self.collection.update_one({'_id': record['_id']},
                                                   {'$pull': {'children_id': pid}})
                        continue
                    # Child no longer claims this record as parent: unlink it.
                    if path_found['parent_path_id'] != record['_id']:
                        self.collection.update_one({'_id': record['_id']},
                                                   {'$pull': {'children_id': pid}})

    def make_document(self, path, for_comparison=False):
        """Build a pathdb-format document for *path*.

        :param path: Path object
        :param for_comparison: when True, omit children_id and render the
            timestamp via .ctime() so documents can be diffed as item sets
        :return: document dict
        """
        # Document layout:
        # {
        #   path:            relative path
        #   path_name:       name of the current directory
        #   parent_path_id:  _id of the parent path document
        #   last_modified:   last modification time
        #   children_id:     list of child-directory _ids
        # }
        if for_comparison:
            # The root directory is '.'; its parent_path_id is None.
            if re.match('^\.$', path.relative_path) is not None:
                document = {'path': path.relative_path,
                            'path_name': path.relative_path,
                            'parent_path_id': None,
                            'last_modified': path.parser.last_modified.ctime()}
            else:
                # NOTE(review): 'relatvie_parent_path' looks misspelled —
                # confirm the Path class really exposes this attribute name.
                parent_id = self.collection.find_one({'path': path.relatvie_parent_path})['_id']
                document = {'path': path.relative_path,
                            'path_name': path.current_path,
                            'parent_path_id': parent_id,
                            'last_modified': path.parser.last_modified.ctime()}
        else:
            # The root directory is '.'; its parent_path_id is None.
            if re.match('^\.$', path.relative_path) is not None:
                document = {'path': path.relative_path,
                            'path_name': path.relative_path,
                            'parent_path_id': None,
                            'last_modified': path.parser.last_modified,
                            'children_id': []}
            else:
                # NOTE(review): same misspelled attribute name as above.
                parent_id = self.collection.find_one({'path': path.relatvie_parent_path})['_id']
                document = {'path': path.relative_path,
                            'path_name': path.current_path,
                            'parent_path_id': parent_id,
                            'last_modified': path.parser.last_modified,
                            'children_id': []}
        return document

    def _raw_path_tree(self, root='.'):
        """Helper: depth-first list of paths below *root* (may contain duplicates).

        :param str root: starting path
        :return: path tree
        :rtype: list
        """
        result = []
        # Document of the starting path.
        root_path = self.collection.find_one({'path': root})
        # All directories whose parent is the starting path.
        child_path = list(self.collection.find({'parent_path_id': root_path['_id']}))
        # Leaf: return just the path itself.
        if len(child_path) < 1:
            return [root_path['path']]
        # Otherwise record each child and recurse below it.
        else:
            for item in child_path:
                result.append(item['path'])
                result.extend(self._raw_path_tree(item['path']))
            return result

    def path_tree(self, root='.'):
        """Return the directory tree below *root* from the database.

        :param str root: starting path
        :return: path tree, duplicates removed, first-seen order kept
        :rtype: list
        """
        # result_set filters out repeated paths while preserving order.
        result_set = set()
        raw_result = self._raw_path_tree(root)
        result = []
        for item in raw_result:
            if item not in result_set:
                result.append(item)
                result_set.add(item)
        return result

    def delete_many(self, ids):
        """Delete path documents by id.

        :param ids: list of document _id values (string form accepted)
        :return: None
        """
        for item in ids:
            self.collection.delete_one({'_id': ObjectId(item)})

    def close(self):
        """Close the database connection.

        :return: None
        """
        self.__path_db.close()
# coding=UTF-8 import json import re from libs.file.class_Excel import Excel from libs.database.class_mongodb import MongoDB import os.path mongo = MongoDB() mongo.connect('region','province') filename_college = r'E:\data\college\college_rating.xlsx' filename_province = r'E:\data\college\province.xlsx' college_rating = Excel(file_name=filename_college) province = Excel(file_name=filename_province) province_dict = dict([(item['region'],item['acode']) for item in mongo.collection.find({})]) def to_acode(province_dict,regions): if isinstance(regions,str): if '/' in regions: regions = re.split('/',regions) else: regions = [regions] result = [] found = False all_found = 0 for region in regions: for province in province_dict:
result = [] journals = json.load(open(r'E:\gitrobot\files\publication\ssci_geography_json.txt')) for journal in journals: if journal[1] not in impact_factor_journals: result.append([journal[0].upper(),journal[1],None]) else: result.append([journal[0].upper(),journal[1],impact_factor_journals[journal[1]]]) # 2. output for record in result: print(record) outfile = r'd:\down\tmp_journal.xlsx' moutexcel = Excel(outfile) moutexcel.new().append(result, 'sheet1') moutexcel.close()''' mongo = MongoDB() mongo.connect('publication','WesternJournal') filename = r'd:\down\journals.xlsx' mexcel = Excel(filename) mdata = mexcel.read(sheet=4) result = [] for item in mdata[1:]: if item[2] == '': result.append({'journal':item[0],'SSIN':item[1],'IF':None}) else: result.append({'journal':item[0],'SSIN':item[1],'IF':item[2]}) #for j in result: # mongo.collection.insert_one(j)
a_journal[name] = float(value) continue a_journal[name] = re.sub('\s+','',value) for j in journal: print(j) out_file = r'E:\gitrobot\files\literature\jjournals_cssci.txt' json.dump(journal, fp=open(out_file,'w')) browser.quit() ''' mongo = MongoDB() mongo.connect('publication','ChineseJournal') ''' literatures = json.load(open(r'E:\gitrobot\files\literature\journals_cssci.txt')) for l in literatures: print(l) #mongo.collection.insert_one(l) print(len(literatures))''' proxy_list = ['101.26.38.162:82'] proxy_list = ['111.56.13.152:80', '101.26.38.162:80', '101.26.38.162:82', '111.56.13.150:80', '60.191.157.155:3128', '60.191.175.54:3128', '60.191.167.93:3128', '61.163.32.6:3128', '49.1.244.139:3128', '112.16.76.188:8080', '60.191.163.147:3128', '60.194.100.51:80', '101.226.12.223:80', '82.200.81.233:80', '85.143.24.70:80', '59.58.162.141:888', '110.18.241.9:3128', '60.15.41.214:3128', '61.7.149.69:8080', '61.184.199.203:3128', '86.100.118.44:81', '61.150.89.67:3128', '61.162.223.41:9797', '95.168.217.24:3128', '86.100.118.44:80', '31.173.74.73:8080', '58.248.137.228:80', '79.120.72.222:3128', '46.218.85.101:3129', '106.56.225.200:3128', '60.15.55.228:3128', '60.13.74.184:81', '101.200.234.114:8080', '104.238.83.28:443', '91.183.124.41:80', '60.191.164.22:3128', '62.204.241.146:8000', '60.191.174.227:3128', '60.191.153.12:3128', '61.53.65.52:3128', '36.250.69.4:80', '61.153.198.178:3128', '60.191.153.75:3128', '60.191.178.43:3128', '60.13.74.184:82', '60.13.74.184:80', '60.191.161.244:3128', '60.191.170.122:3128', '60.191.167.11:3128', '61.175.220.4:3128', '61.164.92.254:9999', '61.75.2.124:3128', '27.122.12.45:3128', '64.62.233.67:80', '113.140.43.51:3128', '60.191.166.130:3128', '113.107.57.76:8101', '113.107.57.76:80', '60.191.160.20:3128', '61.134.34.148:3128', '93.51.247.104:80', '60.191.164.59:3128', '91.142.84.182:3128', '72.252.11.91:8080', '59.44.244.14:9797', '58.18.50.10:3128', '58.96.187.208:3128', '85.194.75.18:8080', '113.105.80.61:3128', 
'58.59.141.187:3128', '61.163.45.240:3128', '91.108.131.250:8080', '110.17.172.150:3128'] #browser = AutoBrowser(proxy=proxy_list[random.randint(0,len(proxy_list)-1)]) #browser = AutoBrowser(proxy='101.26.38.162:82') browser = AutoBrowser() browser.surf('http://navi.cnki.net/knavi/journal/Detailq/CJFD/JJYJ?Year=&Issue=&Entry=', ready_check=(By.CSS_SELECTOR,'#bottom'))