Пример #1
0
 def __init__(self):
     """Create the handle for the ``proxys`` collection of the ``proxy`` database."""
     # Database setup: server connection -> database -> collection.
     # NOTE(review): host and credentials are hard-coded in the connection
     # string; consider loading them from configuration instead.
     connection = MongoDB(
         conn_str='mongodb://*****:*****@123.207.185.126:27017/')
     proxy_db = MonDatabase(database_name='proxy', mongodb=connection)
     self._collection = MonCollection(collection_name='proxys',
                                      database=proxy_db)
Пример #2
0
    def __init__(self):
        """Set up the ``statsgov`` database handle and the variable-query settings."""
        # Database setup.
        # NOTE(review): host and credentials are hard-coded in the connection
        # string; consider loading them from configuration instead.
        connection = MongoDB(
            conn_str='mongodb://*****:*****@123.207.185.126:27017/')
        self._mdb = MonDatabase(database_name='statsgov', mongodb=connection)

        # Address and fixed request parameters of the variable-query page.
        self._var_query_web = 'http://data.stats.gov.cn/adv.htm'
        self._var_query_web_params = dict(m='findZbXl', wd='zb')

        # Label -> short code. The labels read roughly "annual, national" and
        # "annual, regional"; presumably the codes are the site's database
        # codes -- TODO confirm.
        self._tags = {
            '年度全国': 'hgnd',
            '年度地区': 'fsnd',
        }
Пример #3
0
    def __init__(self):
        """Set up the proxy manager, the ``statsgov`` database handle and the query URL template."""
        # Proxy server setup.
        self._proxy_manager = ProxyManager()

        # Database setup.
        # NOTE(review): host and credentials are hard-coded in the connection
        # string; consider loading them from configuration instead.
        connection = MongoDB(
            conn_str='mongodb://*****:*****@123.207.185.126:27017/')
        self._mdb = MonDatabase(database_name='statsgov', mongodb=connection)

        # Label -> short code. The labels read roughly "annual, national" and
        # "annual, regional"; presumably the codes fill the ``dbcode`` slot of
        # the template below -- TODO confirm.
        self._tags = {
            '年度全国': 'hgnd',
            '年度地区': 'fsnd',
        }

        # Query URL with six positional ``{}`` placeholders, in order:
        # m, dbcode, rowcode, colcode, wds, dfwds. The trailing ``k1`` value is
        # a fixed constant.
        self._stats_gov_url_template = (
            'http://data.stats.gov.cn/easyquery.htm?m={}&dbcode={}&rowcode={}&'
            'colcode={}&wds={}&dfwds={}&k1=14930450350'
        )
Пример #4
0
    def __init__(self, website=None, label=None):
        """Store the target website and label and create the proxy and storage helpers.

        Args:
            website: Website address; stored unchanged on ``self.website``.
            label: Label/marker; stored unchanged on ``self.label``.
        """
        self.pmanager = ProxyManager()

        # Website address.
        self.website = website

        # Database setup: collection ``scraper`` in database ``local``.
        # NOTE(review): host and credentials are hard-coded in the connection
        # string; consider loading them from configuration instead.
        connection = MongoDB(
            conn_str='mongodb://*****:*****@123.207.185.126:27017/')
        local_db = MonDatabase(database_name='local', mongodb=connection)
        self.db = MonCollection(collection_name='scraper', database=local_db)

        # Label/marker.
        self.label = label
Пример #5
0
    def __init__(self):
        """Set up the ``proxy`` collection handle, the verification websites and the checked-proxy store."""
        # Database setup: collection ``proxy`` in database ``proxy``.
        # NOTE(review): host and credentials are hard-coded in the connection
        # string; consider loading them from configuration instead.
        connection = MongoDB(
            conn_str='mongodb://*****:*****@123.207.185.126:27017/')
        proxy_db = MonDatabase(database_name='proxy', mongodb=connection)
        self._collection = MonCollection(collection_name='proxy',
                                         database=proxy_db)

        # Websites used for verification, as (address, title) pairs.
        # Presumably the title is what a successfully fetched page is expected
        # to contain -- TODO confirm against the checking code.
        verification_sites = (
            ('http://www.163.com', '网易'),
            ('http://www.sina.com.cn', '新浪首页'),
            ('https://www.douban.com/', '豆瓣'),
            ('http://www.sohu.com/', '搜狐'),
            ('http://www.eastday.com/', '东方网'),
            ('http://www.shanghaiairport.com/', '上海机场(集团)有限公司'),
        )
        self._checked_websites = [
            {'address': address, 'title': title}
            for address, title in verification_sites
        ]

        # Container for proxy servers that have been checked (a dict, despite
        # the "list" in the attribute name).
        self._checked_proxy_list = {}
Пример #6
0
    def insert_to_db(self,
                     literatures=None,
                     database='papers',
                     collection='cnki',
                     condition=None):
        """Insert new literature records into MongoDB or refresh existing ones.

        A record is identified by the combination of its ``title``,
        ``journal``, ``year``, ``issue`` and ``pages`` values. When no stored
        document matches that combination the whole record is inserted;
        otherwise only the ``cite`` and ``download`` fields of the stored
        document are overwritten.

        Records without an ``author`` value, and records rejected by
        ``condition``, are skipped. Progress is reported with ``print``.

        Args:
            literatures: Iterable of dict records. Every record must have the
                keys ``title``, ``journal``, ``year`` and ``issue`` (they are
                indexed directly for the progress line). ``None`` means there
                is nothing to store and the method returns immediately.
            database: Name of the target database.
            collection: Name of the target collection.
            condition: Optional indexable pair ``(field, expected)``; a record
                is skipped unless ``record.get(field) == expected``.

        Raises:
            KeyError: If a record lacks ``title``, ``journal``, ``year`` or
                ``issue``.
        """
        # The default argument used to crash with TypeError in the loop below;
        # treat "no literatures" as a no-op instead.
        if literatures is None:
            return

        # NOTE(review): host and credentials are hard-coded in the connection
        # string; consider loading them from configuration instead.
        mongo = MongoDB(
            conn_str='mongodb://*****:*****@123.207.185.126:27017/')
        mdb = MonDatabase(mongodb=mongo, database_name=database)
        conn = MonCollection(database=mdb, collection_name=collection)

        # Fields that together identify one stored document.
        identity_fields = ('title', 'journal', 'year', 'issue', 'pages')

        for record in literatures:
            print(record['title'], record['journal'], record['year'],
                  record['issue'], record.get('pages'))
            if record.get('author') is None:
                print('No author!')
                continue
            if condition is not None:
                if condition[1] != record.get(condition[0]):
                    print('Journal not matched!->', condition[1],
                          record.get(condition[0]))
                    continue

            # Build the lookup filter once and reuse it for find and update.
            identity = {field: record.get(field) for field in identity_fields}

            # ``conn.collection`` is presumably the underlying pymongo
            # collection -- TODO confirm against MonCollection.
            if conn.collection.find_one(identity) is None:
                print('Insert...!')
                conn.collection.insert_one(record)
            else:
                print('Update...!')
                conn.collection.update_one(
                    identity,
                    {
                        '$set': {
                            'cite': record.get('cite'),
                            'download': record.get('download')
                        }
                    })
Пример #7
0
# coding=UTF-8

# ============================
# @app: 检验火车站点间是否有直通车
# @author: glen
# @date: 2017.1.8
# ============================

import pickle
from libs.database.class_mongodb import MongoDB, MonDatabase, MonCollection
from libs.application.train.class_trainscraper import TrainStationScraper, TrainTicketLeftScraper
from libs.application.train.class_trainscraper import StationPairsGenerator, StationPairValidator

# 0. Initialization
# Switches and settings. DOWNLOAD gates the scrape-and-pickle step below;
# LOAD is presumably consumed by a later step -- TODO confirm.
DOWNLOAD = False
LOAD = True
FILE_NAME = 'station_pairs.pkl'
day = '2017-01-10'  # presumably the travel date to query -- TODO confirm

# Handle for the ``stations`` collection of the ``train`` database.
# NOTE(review): host and credentials are hard-coded in the connection string;
# consider loading them from configuration instead.
train_db = MongoDB(conn_str='mongodb://*****:*****@123.207.185.126:27017/')
train_station_collection = MonCollection(
    collection_name='stations',
    database=MonDatabase(database_name='train', mongodb=train_db),
)

# 1. Scrape the station names and persist every station pair.
# NOTE(review): the original comment said the pairs go into the database, but
# this step pickles them to FILE_NAME -- confirm which is intended.
if DOWNLOAD:
    # Scrape first so that a failed scrape does not leave behind a truncated
    # (empty) pickle file.
    Stations = TrainStationScraper().scrape()
    All_Station_Pairs = list(StationPairsGenerator(stations=Stations)())

    # The context manager closes the file even if pickling raises.
    with open(FILE_NAME, 'wb') as F:
        pickle.dump(All_Station_Pairs, F)