Пример #1
0
class LiteratureReport:
    """Collect literature records and render them as a LaTeX/PDF report.

    Records come either from a MongoDB collection (see
    ``load_record_from_db``) or from a JSON file given to the constructor.
    """

    def __init__(self,database=None,collection=None,file=None):
        """Optionally connect to MongoDB and/or preload records from a JSON file.

        :param database: database name passed to ``MongoDB.connect``; when
            ``None`` no connection is made and ``self.mongo`` is not set
        :param collection: collection name passed to ``MongoDB.connect``
        :param file: path of a JSON file whose parsed content becomes
            ``self.literatures``
        """
        self.literatures = None
        if database is not None:
            self.mongo = MongoDB()
            self.mongo.connect(database,collection)

        if file is not None:
            # Context manager so the handle is closed even if parsing fails
            with open(file) as handle:
                self.literatures = json.load(handle)

    def load_record_from_db(self,query=None,sort=None):
        """Load records from the connected collection into ``self.literatures``.

        :param query: filter document; ``None`` selects every document
        :param sort: sort specification forwarded to the cursor's ``sort``;
            ``None`` leaves the cursor unsorted
        :return: None
        """
        cursor = self.mongo.collection.find({} if query is None else query)
        # Honour the ordering whenever one is given, also without a filter
        if sort is not None:
            cursor = cursor.sort(sort)
        self.literatures = cursor

    def to_report(self):
        """Write every loaded record into the article template and build it.

        Each record contributes a section titled with its ``title``, a
        one-item list joining ``journal``, ``year``, ``vol`` and ``issue``
        with ``---`` and, when present, its ``abstract``. The .tex and .pdf
        outputs are generated at the hard-coded ``E:\\latex\\journalreport``
        location.

        :return: None
        """
        replace_word = {'articleTitle':'Literature Report',
                    'arcticleabstract':'Abstract'}
        doc = Article(r'E:\latex\template\article_template_02.tex',replace_word)
        for item in self.literatures:
            doc.document.add_section(item['title'],3)
            # str.join requires every field to already be a string
            doc.document.add_list(['---'.join([item['journal'],item['year'],item['vol'],item['issue']])],type=1)
            abstract = item.get('abstract')
            if abstract is not None:
                doc.document.append(abstract)
        doc.document.generate_tex(r'E:\latex\journalreport')
        doc.document.generate_pdf(r'E:\latex\journalreport')
Пример #2
0
class ChinaJournalDatabase:
    """Access the Chinese-journal collection (``publication.ChineseJournal``)."""

    def __init__(self,proxy=None):
        """Connect to the collection; ``proxy`` is accepted but not used here."""
        connection = MongoDB()
        connection.connect('publication','ChineseJournal')
        self.mongo = connection
        self.collection = connection.collection

    def getByName(self,journal_name=None,exactly=False,auto=False):
        """Look journals up by their Chinese name.

        :param journal_name: name (or regular-expression fragment) to search
        :param bool exactly: when ``auto`` is false, match the name exactly
            instead of as a ``$regex``
        :param bool auto: try an exact match first and fall back to a
            ``$regex`` match only when the exact match finds nothing
        :return: matching documents
        :rtype: list
        """
        exact_query = {'中文名称':journal_name}
        fuzzy_query = {'中文名称':{'$regex':journal_name}}

        if auto:
            matches = list(self.find(condition=exact_query))
            if not matches:
                matches = list(self.find(condition=fuzzy_query))
            return matches

        chosen_query = exact_query if exactly else fuzzy_query
        return list(self.find(condition=chosen_query))

    def find(self,condition=None):
        """Run a query against the collection.

        :param dict condition: query filter
        :return: whatever the collection's ``find`` returns
        """
        return self.collection.find(condition)

    def close(self):
        """Close the database connection.

        :return: None
        """
        self.mongo.close()
Пример #3
0
 def __init__(self):
     """Bind ``self._collection`` to the ``proxys`` collection of the ``proxy`` database."""
     # Database setup: client -> database -> collection
     client = MongoDB(
         conn_str='mongodb://*****:*****@123.207.185.126:27017/')
     proxy_database = MonDatabase(mongodb=client, database_name='proxy')
     self._collection = MonCollection(
         database=proxy_database,
         collection_name='proxys',
     )
Пример #4
0
 def __init__(self):
     """Connect to ``FileIndex.filedb`` and create the companion ``PathDB``."""
     # File collection
     connection = MongoDB()
     connection.connect("FileIndex", "filedb")
     self.__file_db = connection
     self.collection = connection.collection
     # Path collection, used to resolve directories
     self.__path_db = PathDB()
Пример #5
0
class TagDB:
    """Wrapper around the ``tagdb`` collection of the ``FileIndex`` database."""

    def __init__(self):
        """Open the connection and keep a handle on the collection."""
        self.__tag_db = MongoDB()
        self.__tag_db.connect('FileIndex', 'tagdb')
        self.collection = self.__tag_db.collection

    @staticmethod
    def _comparable(document):
        """Return a copy of ``document`` with ``files`` flattened to one string.

        The flattened form makes every value hashable so two documents can be
        compared through their ``items()`` views; the input is not modified.
        """
        flattened = dict(document)
        flattened['files'] = '|'.join(str(item) for item in document['files'])
        return flattened

    def update(self,tag):
        """Insert the tag document, or refresh its ``files`` when it changed.

        The caller's ``tag`` dict is left untouched and the ``files`` value is
        stored exactly as supplied, so the stored field keeps the same type
        it had when the document was first inserted.

        :param dict tag: tag document with at least ``tag`` and ``files`` keys
        :return: ``_id`` of the existing document, or ``None`` when the tag
            was not present and has just been inserted
        """
        tag_found = self.collection.find_one({'tag':tag['tag']})
        if tag_found is None:
            self.collection.insert_one(tag)
            return None

        tagid = tag_found.pop('_id')
        # Any differing key triggers the write, but only ``files`` is written
        difference = self._comparable(tag).items() - self._comparable(tag_found).items()
        if difference:
            self.collection.find_one_and_update({'_id':tagid}, {'$set':{'files':tag['files']}})
        return tagid

    def delete_many(self,ids):
        """Delete the documents whose ``_id`` is listed.

        :param ids: iterable of document ids, each convertible by ``ObjectId``
        :return: None
        """
        for item in ids:
            self.collection.delete_one({'_id':ObjectId(item)})

    def close(self):
        """Close the database connection.

        :return: None
        """
        self.__tag_db.close()
Пример #6
0
    def __init__(self,database=None,collection=None,file=None):
        """Optionally connect to MongoDB and/or preload records from a JSON file.

        :param database: database name passed to ``MongoDB.connect``; when
            ``None`` no connection is made and ``self.mongo`` is not set
        :param collection: collection name passed to ``MongoDB.connect``
        :param file: path of a JSON file whose parsed content becomes
            ``self.literatures``
        """
        self.literatures = None
        if database is not None:
            self.mongo = MongoDB()
            self.mongo.connect(database,collection)

        if file is not None:
            # Context manager so the handle is closed even if parsing fails
            with open(file) as handle:
                self.literatures = json.load(handle)
Пример #7
0
    def __init__(self):
        """Set up the ``statsgov`` database handle and the query constants."""
        # Database handle
        client = MongoDB(conn_str='mongodb://*****:*****@123.207.185.126:27017/')
        self._mdb = MonDatabase(mongodb=client, database_name='statsgov')

        # Endpoint and fixed parameters of the variable-query page
        self._var_query_web = 'http://data.stats.gov.cn/adv.htm'
        self._var_query_web_params = dict(m='findZbXl', wd='zb')

        # Label -> code mapping
        self._tags = {
            '年度全国': 'hgnd',
            '年度地区': 'fsnd',
        }
Пример #8
0
 def get_ISSN_from_journal(self,auto=True):
     """Look up this object's journal in ``publication.WesternJournal``.

     The journal name (``self.journal``) is upper-cased and matched exactly;
     with ``auto`` enabled an empty exact match falls back to a ``$regex``
     match on the same name. The connection opened for the lookup is closed
     before returning.

     :param bool auto: allow the ``$regex`` fallback
     :return: the ``SSIN`` field of the first match, or ``None`` when
         nothing matched
     """
     journal_name = self.journal.upper()
     mongo = MongoDB()
     mongo.connect('publication','WesternJournal')
     try:
         result = list(mongo.collection.find({'journal':journal_name}))
         if auto and len(result) < 1:
             result = list(mongo.collection.find({'journal':{'$regex':journal_name}}))
     finally:
         # Results are already materialised, so the connection can go
         mongo.close()

     if len(result) < 1:
         return None
     # The collection stores the identifier under the key 'SSIN'
     return result[0]['SSIN']
Пример #9
0
    def __init__(self):
        """Create the proxy manager, the ``statsgov`` database handle and URL constants."""
        # Proxy server manager
        self._proxy_manager = ProxyManager()

        # Database handle
        client = MongoDB(conn_str='mongodb://*****:*****@123.207.185.126:27017/')
        self._mdb = MonDatabase(mongodb=client, database_name='statsgov')

        # Label -> code mapping
        self._tags = {
            '年度全国': 'hgnd',
            '年度地区': 'fsnd',
        }
        # Query URL with six positional placeholders
        self._stats_gov_url_template = (
            'http://data.stats.gov.cn/easyquery.htm?m={}&dbcode={}&rowcode={}&'
            'colcode={}&wds={}&dfwds={}&k1=14930450350'
        )
Пример #10
0
    def __init__(self, website=None, label=None):
        """Store the target site and label and bind the ``local.scraper`` collection.

        :param website: address of the site, stored as ``self.website``
        :param label: marker stored as ``self.label``
        """
        self.pmanager = ProxyManager()

        # Site address
        self.website = website

        # Database: client -> database -> collection
        client = MongoDB(
            conn_str='mongodb://*****:*****@123.207.185.126:27017/')
        local_database = MonDatabase(mongodb=client, database_name='local')
        self.db = MonCollection(
            database=local_database,
            collection_name='scraper',
        )

        # Marker
        self.label = label
Пример #11
0
    def __init__(self):
        """Bind the ``proxy.proxy`` collection and prepare the validation state."""
        # Database: client -> database -> collection
        client = MongoDB(conn_str='mongodb://*****:*****@123.207.185.126:27017/')
        proxy_database = MonDatabase(mongodb=client, database_name='proxy')
        self._collection = MonCollection(database=proxy_database, collection_name='proxy')

        # Sites used for validation, as (address, title) pairs
        checked_sites = (
            ('http://www.163.com', '网易'),
            ('http://www.sina.com.cn', '新浪首页'),
            ('https://www.douban.com/', '豆瓣'),
            ('http://www.sohu.com/', '搜狐'),
            ('http://www.eastday.com/', '东方网'),
            ('http://www.shanghaiairport.com/', '上海机场(集团)有限公司'),
        )
        self._checked_websites = [
            {'address': address, 'title': title}
            for address, title in checked_sites
        ]

        # Proxies that have already been checked
        self._checked_proxy_list = {}
Пример #12
0
    def insert_to_db(self,
                     literatures=None,
                     database='papers',
                     collection='cnki',
                     condition=None):
        """Insert literature records, or refresh the counters of existing ones.

        A record is identified by its ``title``, ``journal``, ``year``,
        ``issue`` and ``pages``. Unknown records are inserted whole; for known
        records only ``cite`` and ``download`` are overwritten. Records
        without an ``author`` are skipped.

        :param literatures: iterable of record dicts; ``None`` means there is
            nothing to do and no connection is opened
        :param str database: database name
        :param str collection: collection name
        :param condition: optional ``(field, expected_value)`` pair; records
            whose ``field`` differs from ``expected_value`` are skipped
        :return: None
        """
        # The default used to reach the loop and fail on iterating None
        if literatures is None:
            return

        mongo = MongoDB(
            conn_str='mongodb://*****:*****@123.207.185.126:27017/')
        mdb = MonDatabase(mongodb=mongo, database_name=database)
        conn = MonCollection(database=mdb, collection_name=collection)

        identity_fields = ('title', 'journal', 'year', 'issue', 'pages')

        for record in literatures:
            print(record['title'], record['journal'], record['year'],
                  record['issue'], record.get('pages'))
            if record.get('author') is None:
                print('No author!')
                continue
            if condition is not None:
                if not condition[1] == record.get(condition[0]):
                    print('Journal not matched!->', condition[1],
                          record.get(condition[0]))
                    continue

            # One filter shared by the lookup and the update
            identity = {field: record.get(field) for field in identity_fields}
            if conn.collection.find_one(identity) is None:
                print('Insert...!')
                conn.collection.insert_one(record)
            else:
                print('Update...!')
                conn.collection.update_one(
                    identity,
                    {
                        '$set': {
                            'cite': record.get('cite'),
                            'download': record.get('download')
                        }
                    })
result = []
journals = json.load(open(r'E:\gitrobot\files\publication\ssci_geography_json.txt'))
for journal in journals:
    if journal[1] not in impact_factor_journals:
        result.append([journal[0].upper(),journal[1],None])
    else:
        result.append([journal[0].upper(),journal[1],impact_factor_journals[journal[1]]])

# 2. output
for record in result:
    print(record)

outfile = r'd:\down\tmp_journal.xlsx'
moutexcel = Excel(outfile)
moutexcel.new().append(result, 'sheet1')
moutexcel.close()'''

# Connect to the Western-journal collection
mongo = MongoDB()
mongo.connect('publication','WesternJournal')

# Read the fifth sheet (index 4) of the workbook
filename = r'd:\down\journals.xlsx'
mexcel = Excel(filename)
mdata = mexcel.read(sheet=4)

# Turn every row after the header into a document; an empty third cell
# becomes a missing impact factor
result = []
for item in mdata[1:]:
    result.append({
        'journal': item[0],
        'SSIN': item[1],
        'IF': None if item[2] == '' else item[2],
    })

#for j in result:
#    mongo.collection.insert_one(j)
Пример #14
0
 def __init__(self,proxy=None):
     """Connect to ``publication.ChineseJournal``; ``proxy`` is accepted but not used here."""
     connection = MongoDB()
     connection.connect('publication','ChineseJournal')
     self.mongo = connection
     self.collection = connection.collection
Пример #15
0
# coding=UTF-8

import json
import re
from libs.file.class_Excel import Excel
from libs.database.class_mongodb import MongoDB
import os.path

# Province collection
mongo = MongoDB()
mongo.connect('region','province')

# Input workbooks
filename_college = r'E:\data\college\college_rating.xlsx'
filename_province = r'E:\data\college\province.xlsx'

college_rating = Excel(file_name=filename_college)
province = Excel(file_name=filename_province)

# Region name -> administrative code, built from every province document
province_dict = {
    item['region']: item['acode']
    for item in mongo.collection.find({})
}

def to_acode(province_dict,regions):
    if isinstance(regions,str):
        if '/' in regions:
            regions = re.split('/',regions)
        else:
            regions = [regions]

    result = []
    found = False
    all_found = 0
    for region in regions:
        for province in province_dict:
Пример #16
0
class PathDB:
    """Wrapper around the ``pathdb`` collection of the ``FileIndex`` database."""

    def __init__(self):
        """Open the connection and keep a handle on the collection."""
        self.__path_db = MongoDB()
        self.__path_db.connect('FileIndex', 'pathdb')
        self.collection = self.__path_db.collection

    def update(self,path):
        """Insert the path document, or update the fields that changed.

        :param path: path object; its ``relative_path`` is the lookup key
        :return: ``_id`` of the existing document, or ``None`` when the path
            was not present and has just been inserted
        """
        # Look the document up by relative path
        document_in_db = self.collection.find_one({'path':path.relative_path})
        if document_in_db is None:
            self.collection.insert_one(self.make_document(path,False))
            return None

        # Bring the stored document into the comparison form produced by
        # make_document(path, True): no _id, no children_id, ctime string
        did = document_in_db.pop('_id')
        document_in_db.pop('children_id')
        document_in_db['last_modified'] = document_in_db['last_modified'].ctime()

        difference = dict(self.make_document(path,True).items() - document_in_db.items())
        if 'last_modified' in difference:
            # Store the original value, not the ctime string used to compare
            difference['last_modified'] = path.parser.last_modified
        if difference:
            self.collection.find_one_and_update({'_id':did},{'$set':difference})

        return did

    def traverse_and_update(self):
        """Reconcile the ``children_id`` lists with the ``parent_path_id`` links.

        Pass one walks every document and makes sure its id is listed in the
        ``children_id`` of the document its ``parent_path_id`` points to.
        Pass two walks every document and removes from ``children_id`` any id
        whose document no longer exists or no longer names this document as
        its parent.

        :return: None
        """
        # Pass one: register every document with its parent
        for record in self.collection.find({}):
            record_parent_path_id = record['parent_path_id']
            if record_parent_path_id is not None:
                # NOTE(review): a parent id with no matching document makes
                # the subscript below fail -- confirm that cannot happen
                children_id = set(self.collection.find_one({'_id':record_parent_path_id})['children_id'])
                if record['_id'] not in children_id:
                    self.collection.update_one({'_id':record_parent_path_id},
                                               {'$addToSet':{'children_id':record.get('_id')}},upsert =True)

        # Pass two: drop children that vanished or moved to another parent
        for record in self.collection.find({}):
            for pid in record['children_id']:
                path_found = self.collection.find_one({'_id':pid})
                if path_found is None or path_found['parent_path_id'] != record['_id']:
                    self.collection.update_one({'_id':record['_id']},
                                               {'$pull':{'children_id':pid}})

    def make_document(self,path,for_comparison=False):
        """Build the ``pathdb`` document describing ``path``.

        Stored documents have the keys ``path`` (relative path),
        ``path_name``, ``parent_path_id``, ``last_modified`` and
        ``children_id``.

        :param path: path object
        :param bool for_comparison: when true, return the comparison form:
            ``last_modified`` as a ``ctime()`` string and no ``children_id``
        :return: the document
        :rtype: dict
        """
        # The root directory is '.', and it has no parent
        if re.match(r'^\.$',path.relative_path) is not None:
            path_name = path.relative_path
            parent_id = None
        else:
            # NOTE(review): attribute name is spelled 'relatvie' here; kept
            # as is because the path class is not visible -- confirm
            parent_id = self.collection.find_one({'path':path.relatvie_parent_path})['_id']
            path_name = path.current_path

        document = {'path':path.relative_path,
                    'path_name':path_name,
                    'parent_path_id':parent_id,
                    'last_modified':path.parser.last_modified}
        if for_comparison:
            document['last_modified'] = document['last_modified'].ctime()
        else:
            document['children_id'] = []

        return document

    def _raw_path_tree(self,root='.'):
        """Helper: list the paths below ``root`` as recorded in the database.

        :param str root: starting path
        :return: paths in depth-first order, possibly with repeats; only
            ``root`` itself when it has no children
        :rtype: list
        """
        result = []
        root_path = self.collection.find_one({'path':root})
        child_path = list(self.collection.find({'parent_path_id':root_path['_id']}))
        # A leaf reports itself
        if len(child_path) < 1:
            return [root_path['path']]
        # Otherwise report each child followed by everything below it
        for item in child_path:
            result.append(item['path'])
            result.extend(self._raw_path_tree(item['path']))
        return result

    def path_tree(self,root='.'):
        """List the paths below ``root`` as recorded in the database.

        :param str root: starting path
        :return: paths in first-seen order with repeats removed
        :rtype: list
        """
        # dict.fromkeys drops repeats while keeping first-seen order
        return list(dict.fromkeys(self._raw_path_tree(root)))

    def delete_many(self,ids):
        """Delete the documents whose ``_id`` is listed.

        :param ids: iterable of document ids, each convertible by ``ObjectId``
        :return: None
        """
        for item in ids:
            self.collection.delete_one({'_id':ObjectId(item)})

    def close(self):
        """Close the database connection.

        :return: None
        """
        self.__path_db.close()
Пример #17
0
class ElisePaper:
    """Assemble college entrance-exam tables from MongoDB and a province workbook."""

    def __init__(self):
        """Load the province workbook and the province collection into lookup dicts."""
        self.mongo = MongoDB()
        province_economy = Excel('E:\\data\\college\\province.xlsx').read()
        # First column (as an integer string) -> third column, header skipped
        self.province_economy_dict = {
            str(int(row[0])): row[2] for row in province_economy[1:]
        }
        self.mongo.connect('region','province')
        # Administrative code -> region name
        self.province_dict = {
            doc['acode']: doc['region'] for doc in self.mongo.collection.find({})
        }

    def get_data_by_region(self,acode,subject='理科'):
        """Build the admission table for students of one region.

        :param acode: value matched against ``student_region``
        :param subject: value matched against ``subject``
        :return: header row followed by one row per matching exam record
        :rtype: list
        """
        table = [['学校','类型','录取平均分','学校地区','学校评分','地区收入']]
        self.mongo.connect('college','entranceexam')
        exam_records = self.mongo.collection.find({'student_region':acode,'subject':subject})
        # Ratings live in a different collection
        self.mongo.connect('college','collegeinfo')
        for exam in exam_records:
            college = self.mongo.collection.find_one({'name':exam['university']})
            rating = college['rating']
            income = self.province_economy_dict[exam['university_region']]
            row = [exam['university'],exam['type'],exam['average_score'],
                   exam['university_region'],rating,income]
            table.append(row)
        return table

    def get_college_data(self):
        """Build the college ranking table, ordered by rating.

        Only colleges whose ``region`` has fewer than two entries are kept,
        and colleges sharing a rating collapse to the last one read.

        :return: header row followed by one row per kept college
        :rtype: list
        """
        self.mongo.connect('college','entranceexam')
        colleges = self.mongo.collection.find({}).distinct('university')
        self.mongo.connect('college','collegeinfo')
        wanted_fields = {'_id':0,'name':1,'rating':1,'score':1,'region':1,'project':1,'type':1}
        records = self.mongo.collection.find({'name':{'$in':colleges}},
                                             projection=wanted_fields)

        single_region = [record for record in records if len(record['region']) < 2]
        by_rating = {record['rating']: record for record in single_region}

        table = [['排名','学校','类型','评分','地区','地区可支配收入']]
        for rating in sorted(by_rating):
            college = by_rating[rating]
            region = college['region'][0]
            region_income = self.province_economy_dict[region]
            table.append([college['rating'],college['name'],
                          college['type'],college['score'],
                          region,region_income])
        return table


    def close(self):
        """Close the database connection."""
        self.mongo.close()
Пример #18
0
 def __init__(self):
     """Load the province workbook and the province collection into lookup dicts."""
     self.mongo = MongoDB()
     province_economy = Excel('E:\\data\\college\\province.xlsx').read()
     # First column (as an integer string) -> third column, header skipped
     self.province_economy_dict = {
         str(int(row[0])): row[2] for row in province_economy[1:]
     }
     self.mongo.connect('region','province')
     # Administrative code -> region name
     self.province_dict = {
         doc['acode']: doc['region'] for doc in self.mongo.collection.find({})
     }
Пример #19
0
 def __init__(self):
     """Connect to ``FileIndex.tagdb`` and keep a handle on the collection."""
     connection = MongoDB()
     connection.connect('FileIndex', 'tagdb')
     self.__tag_db = connection
     self.collection = connection.collection
Пример #20
0
# coding=UTF-8

# ============================
# @app: 检验火车站点间是否有直通车
# @author: glen
# @date: 2017.1.8
# ============================

import pickle
from libs.database.class_mongodb import MongoDB, MonDatabase, MonCollection
from libs.application.train.class_trainscraper import TrainStationScraper, TrainTicketLeftScraper
from libs.application.train.class_trainscraper import StationPairsGenerator, StationPairValidator

# 0. 初始化
train_db = MongoDB(conn_str='mongodb://*****:*****@123.207.185.126:27017/')
train_station_collection = MonCollection(database=MonDatabase(mongodb=train_db, database_name='train'),
                                         collection_name='stations')
day = '2017-01-10'

DOWNLOAD = False
LOAD = True
FILE_NAME = 'station_pairs.pkl'

# 1. Scrape the station names and pickle every station pair to FILE_NAME
if DOWNLOAD:
    # Scrape first: a failed scrape must not leave a truncated pickle behind
    Stations = TrainStationScraper().scrape()
    All_Station_Pairs = list(StationPairsGenerator(stations=Stations)())

    # Context manager so the file is closed even if pickling fails
    with open(FILE_NAME, 'wb') as F:
        pickle.dump(All_Station_Pairs, F)
Пример #21
0
class FileDB:
    """Wrapper around the ``filedb`` collection of the ``FileIndex`` database."""

    def __init__(self):
        """Connect to ``FileIndex.filedb`` and create the companion ``PathDB``."""
        self.__file_db = MongoDB()
        self.__file_db.connect("FileIndex", "filedb")
        self.collection = self.__file_db.collection
        # Path collection, used to resolve directories
        self.__path_db = PathDB()

    def update(self, file, path):
        """Record ``file`` in the collection unless an identical entry exists.

        The file is looked up by its relative directory joined with its name
        without special characters. When no entry exists, or the stored entry
        differs from the file on disk, a new document is inserted.

        :param file: file object
        :param path: path object of the directory holding the file
        :return: ``_id`` of the stored entry when it matches the file
            exactly, otherwise ``None`` (a document was inserted)
        :raises FileNotFoundError: when the directory is not in ``pathdb``
        """
        # The directory must already be known
        path_found = self.__path_db.collection.find_one({"path": path.relative_path})
        if path_found is None:
            print("Path wrong!")
            raise FileNotFoundError

        file_found = self.collection.find_one(
            {
                "full_file_name_without_sc": os.path.join(
                    path.relative_path, file.parser.path_name_without_special_characters
                )
            }
        )
        if file_found is None:
            self.collection.insert_one(self.make_document(file, path, path_found, False))
            return None

        fid = file_found["_id"]
        # Fields where the file on disk disagrees with the stored entry
        difference = dict(
            self.make_document(file, path, path_found, True).items()
            - self.make_document_from_db(file_found).items()
        )
        # Convert the comparison forms back to their storage forms; a field
        # that became None has nothing to split
        if difference.get("tags") is not None:
            difference["tags"] = re.split(r"\|", difference["tags"])
        if difference.get("projects") is not None:
            difference["projects"] = re.split(r"\|", difference["projects"])
        if "last_modified" in difference:
            difference["last_modified"] = file.parser.last_modified
        if "time" in difference:
            difference["time"] = file.parser.time

        if len(difference) > 0:
            # NOTE(review): a changed file is inserted as a new document
            # rather than updated in place -- confirm this is intended
            self.collection.insert_one(self.make_document(file, path, path_found, False))
            return None
        return fid

    @classmethod
    def make_document(cls, file, path, path_found, for_comparison=False):
        """Build the ``filedb`` document describing ``file``.

        :param file: file object
        :param path: path object of the directory holding the file
        :param dict path_found: the directory's document from ``pathdb``
        :param bool for_comparison: when true, return the comparison form:
            timestamps as ``ctime()`` strings and ``tags`` / ``projects``
            as '|'-joined strings (``None`` stays ``None``)
        :return: the document
        :rtype: dict
        """
        document = {
            "full_file_name": os.path.join(path.relative_path, file.parser.path_name),
            "full_file_name_without_sc": os.path.join(
                path.relative_path, file.parser.path_name_without_special_characters
            ),
            "special_characters": file.parser.special_character_part,
            "file_name": file.parser.path_name,
            "directory": path_found["_id"],
            "extension": file.parser.extension,
            "last_modified": file.parser.last_modified,
            "size": len(file),
            "author": file.parser.author,
            "time": file.parser.time,
            "version": file.parser.version,
            "tags": file.parser.tags,
            "projects": file.parser.projects,
        }
        if for_comparison:
            document["last_modified"] = document["last_modified"].ctime()
            if document["time"] is not None:
                document["time"] = document["time"].ctime()
            if document["tags"] is not None:
                document["tags"] = "|".join(document["tags"])
            if document["projects"] is not None:
                document["projects"] = "|".join(document["projects"])

        return document

    @classmethod
    def make_document_from_db(cls, file):
        """Convert a stored file document into the comparison form.

        The input is copied, so the caller's dict is left untouched.

        :param dict file: document read from the collection
        :return: copy without ``_id``, with timestamps as ``ctime()`` strings
            and ``tags`` / ``projects`` as '|'-joined strings
        :rtype: dict
        """
        record = dict(file)
        record.pop("_id")
        record["last_modified"] = record["last_modified"].ctime()

        if record["time"] is not None:
            record["time"] = record["time"].ctime()
        if record["tags"] is not None:
            record["tags"] = "|".join(record["tags"])
        if record["projects"] is not None:
            record["projects"] = "|".join(record["projects"])
        return record

    def delete_many(self, ids):
        """Delete the documents whose ``_id`` is listed.

        :param ids: iterable of document ids, each convertible by ``ObjectId``
        :return: None
        """
        for item in ids:
            self.collection.delete_one({"_id": ObjectId(item)})

    def make_tag_document(self):
        """Group file ids by tag, in the shape ``tagdb`` expects.

        Files whose ``tags`` field is missing or ``None`` contribute nothing.

        :return: mapping of tag to the list of file ``_id`` values
        :rtype: defaultdict
        """
        tags = defaultdict(list)
        tag_items = self.collection.find({}, {"_id": 1, "tags": 1})
        for item in tag_items:
            # make_document stores None when a file carries no tags
            for tag in item.get("tags") or []:
                tags[tag].append(item["_id"])
        return tags

    def get_files_according_to_path_list(self, path_list):
        """List the file names recorded for each directory.

        :param list path_list: relative directory paths
        :return: mapping of path to its file names, in ``path_list`` order
        :rtype: OrderedDict
        """
        result = OrderedDict()
        for path in path_list:
            path_found = self.__path_db.collection.find_one({"path": path})
            files_found = self.collection.find({"directory": path_found["_id"]})
            result[path] = [file["file_name"] for file in files_found]
        return result

    def find_and_open(self, base_path, temp_path, **condition):
        """Copy the files matching ``condition`` to ``temp_path`` and open it.

        :param str base_path: directory the stored ``full_file_name`` values
            are joined to
        :param str temp_path: destination directory, opened with
            ``os.startfile`` once the copies are done
        :param condition: query filter given as keyword arguments
        :return: ``"None is found!"`` when nothing matched, otherwise
            ``"successfully!"``
        :rtype: str
        """
        result = list(self.collection.find(condition))
        if len(result) < 1:
            return "None is found!"

        for item in result:
            source_file = os.path.join(base_path, item["full_file_name"])
            destination_file = os.path.join(temp_path, item["file_name"])
            OSOperator.copy_to(source_file, destination_file)
        os.startfile(temp_path)
        return "successfully!"

    def close(self):
        """Close the database connection.

        :return: None
        """
        self.__file_db.close()
Пример #22
0
# coding=UTF-8

from libs.database.class_mongodb import MongoDB
from libs.file.class_Excel import Excel

# Connect to the 'literature' collection of the 'paper' database
mongo = MongoDB()
mongo.connect('paper','literature')

# Print the distinct values of the 'journal' field across all documents
result = mongo.collection.find({}).distinct('journal')
print(result)
''''
result = mongo.collection.find({'journal':'Journal of Econometrics',
                                'keyword':{'$exists':True}})

keyword = set()
for item in result:
    keyword.update([item.lower() for item in item['keyword']])

mdata = [[item] for item in sorted(keyword)]

outfile = r'd:\down\keywords.xlsx'
moutexcel = Excel(outfile)
moutexcel.new().append(mdata, 'mysheet')
moutexcel.close()'''

# NOTE(review): this first query result is replaced by the next assignment
# before anything reads it
result = mongo.collection.find({'keyword':'ARMA'})
# Documents with a keyword that is exactly 'autocorrelation' or 'Autocorrelation'
result = mongo.collection.find({'keyword':{'$regex':'^(a|A)utocorrelation$'}})
# Print the keyword field of every match
for item in result:
    print(item['keyword'])
Пример #23
0
        if re.match('^复合影响因子$',name) is not None:
            a_journal[name] = float(value)
            continue

        a_journal[name] = re.sub('\s+','',value)

for j in journal:
    print(j)

out_file = r'E:\gitrobot\files\literature\jjournals_cssci.txt'
json.dump(journal, fp=open(out_file,'w'))
browser.quit()

'''

# Connect to the 'ChineseJournal' collection of the 'publication' database
mongo = MongoDB()
mongo.connect('publication','ChineseJournal')
'''
literatures = json.load(open(r'E:\gitrobot\files\literature\journals_cssci.txt'))
for l in literatures:
    print(l)
    #mongo.collection.insert_one(l)
print(len(literatures))'''

# NOTE(review): this single-entry list is replaced by the assignment on the
# following line before anything reads it
proxy_list = ['101.26.38.162:82']
proxy_list = ['111.56.13.152:80', '101.26.38.162:80', '101.26.38.162:82', '111.56.13.150:80', '60.191.157.155:3128', '60.191.175.54:3128', '60.191.167.93:3128', '61.163.32.6:3128', '49.1.244.139:3128', '112.16.76.188:8080', '60.191.163.147:3128', '60.194.100.51:80', '101.226.12.223:80', '82.200.81.233:80', '85.143.24.70:80', '59.58.162.141:888', '110.18.241.9:3128', '60.15.41.214:3128', '61.7.149.69:8080', '61.184.199.203:3128', '86.100.118.44:81', '61.150.89.67:3128', '61.162.223.41:9797', '95.168.217.24:3128', '86.100.118.44:80', '31.173.74.73:8080', '58.248.137.228:80', '79.120.72.222:3128', '46.218.85.101:3129', '106.56.225.200:3128', '60.15.55.228:3128', '60.13.74.184:81', '101.200.234.114:8080', '104.238.83.28:443', '91.183.124.41:80', '60.191.164.22:3128', '62.204.241.146:8000', '60.191.174.227:3128', '60.191.153.12:3128', '61.53.65.52:3128', '36.250.69.4:80', '61.153.198.178:3128', '60.191.153.75:3128', '60.191.178.43:3128', '60.13.74.184:82', '60.13.74.184:80', '60.191.161.244:3128', '60.191.170.122:3128', '60.191.167.11:3128', '61.175.220.4:3128', '61.164.92.254:9999', '61.75.2.124:3128', '27.122.12.45:3128', '64.62.233.67:80', '113.140.43.51:3128', '60.191.166.130:3128', '113.107.57.76:8101', '113.107.57.76:80', '60.191.160.20:3128', '61.134.34.148:3128', '93.51.247.104:80', '60.191.164.59:3128', '91.142.84.182:3128', '72.252.11.91:8080', '59.44.244.14:9797', '58.18.50.10:3128', '58.96.187.208:3128', '85.194.75.18:8080', '113.105.80.61:3128', '58.59.141.187:3128', '61.163.45.240:3128', '91.108.131.250:8080', '110.17.172.150:3128']
#browser = AutoBrowser(proxy=proxy_list[random.randint(0,len(proxy_list)-1)])
#browser = AutoBrowser(proxy='101.26.38.162:82')
# Start a browser without a proxy and open the CNKI journal detail page.
# ready_check is a (locator strategy, selector) pair -- presumably the element
# awaited before surf() returns; confirm against AutoBrowser
browser = AutoBrowser()
browser.surf('http://navi.cnki.net/knavi/journal/Detailq/CJFD/JJYJ?Year=&Issue=&Entry=',
             ready_check=(By.CSS_SELECTOR,'#bottom'))
Пример #24
0
# coding=UTF-8

from libs.database.class_mongodb import MongoDB
from collections import deque

# 1. 连接数据库集合
path_db = MongoDB()
file_db = MongoDB()
tag_db = MongoDB()

path_db.connect('FileIndex','pathdb')
file_db.connect('FileIndex','filedb')
tag_db.connect('FileIndex','tagdb')

# 2. Print the directory tree, starting from the root document ('.')
paths = deque(path_db.collection.find({'path':'.'}))
print(paths)

# The deque is used as a stack; children are pushed in reverse so the first
# child is the next one popped
while paths:
    newpath = paths.pop()
    print(newpath['path'])
    paths.extend(
        path_db.collection.find_one({'_id': child_id})
        for child_id in reversed(newpath['children_id'])
    )










Пример #25
0
from applications.literature.class_cnki import Cnki
from libs.database.class_mongodb import MongoDB
from libs.latex.class_article import Article

# 1. 配置初始参数
PROXY_LIST = [
    '58.20.128.123:80', '36.7.151.29:8000', '61.174.13.12:80',
    '112.90.179.153:4040', '58.20.235.180:8000', '58.22.86.44:8000',
    '101.226.249.237:80', '111.1.89.254:80', '112.16.87.24:80',
]
# The list above is superseded by this one
PROXY_LIST = [
    '111.56.13.150:80', '115.159.5.247:8080', '117.136.234.6:843',
    '60.191.179.53:3128', '60.191.163.235:3128', '120.52.73.33:80',
]
QUERY_STRING = "JN='经济研究'"
START_PERIOD = "2010"
END_PERIOD = "2016"
SUBJECTS = ["经济与管理科学","社会科学Ⅱ辑"]
LITERATURE_JSON_FILE = r"E:\gitrobot\files\literature\literature_list.txt"

# Journal name -> composite impact factor (None when the field is absent)
db = MongoDB()
db.connect('publication','ChineseJournal')
journals = db.collection.find({},projection={'_id':0,'中文名称':1,'复合影响因子':1})
jours = {
    journal['中文名称']: journal.get('复合影响因子')
    for journal in journals
}
jours_set = jours.keys()

# Step switches
STEP_ONE = False
STEP_TWO = True

# 2. 进行CNKI网站操作
if STEP_ONE:
    cnki_obj = Cnki()
    #cnki_obj = Cnki(PROXY_LIST[random.randint(0,len(PROXY_LIST)-1)])
    cnki_obj.set_query(QUERY_STRING)
    cnki_obj.set_period(start_period=START_PERIOD,end_period=END_PERIOD)
    cnki_obj.set_subject(subjects=SUBJECTS)