class ProxyManager:
    """Read proxy server addresses from a MongoDB collection.

    :param str database: database name handed to ``MonCollection``
        (default ``'proxy'``)
    :param str collection_name: collection name handed to ``MonCollection``
        (default ``'proxys'``)
    """
    def __init__(self, database='proxy', collection_name='proxys'):
        # Open the collection that holds the proxy records.
        self._conn = MonCollection(mongodb=MongoDB(),
                                   database=database,
                                   collection_name=collection_name)

    def find(self, type=0, limit=None):
        """Return the addresses of the best-ranked proxies.

        Records are ordered by ``score`` (descending) and then ``speed``
        (ascending) before ``limit`` is applied.

        :param type: ``0`` selects records whose ``protocol`` is 0 or 2; any
            other value selects records whose ``protocol`` is 1 or 2. The
            value is also forwarded unchanged to ``Proxy``.
        :param limit: maximum number of records to read, forwarded to the
            collection's ``find``
        :return: set of ``Proxy(...).address`` values (unordered)
        """
        # The two selectable groups only differ in the accepted protocol codes.
        protocols = [0, 2] if type == 0 else [1, 2]
        cursor = self._conn.find(
            filter={'protocol': {'$in': protocols}},
            projection={
                '_id': False,
                'ip': True,
                'port': True,
                'type': True
            },
            sort=[('score', DESCENDING), ('speed', ASCENDING)],
            limit=limit)

        addresses = set()
        for doc in cursor:
            proxy = Proxy(ip=doc['ip'], port=doc['port'], type=type)
            addresses.add(proxy.address)
        return addresses

    @property
    def random_proxy(self):
        """Pick one proxy address uniformly at random.

        The candidates are the (at most) 100 addresses returned by
        ``find(limit=100)``.

        :return: a single proxy address
        """
        candidates = list(self.find(limit=100))
        return random.choice(candidates)

    @property
    def top_150_proxies(self):
        """Return the addresses produced by ``find(limit=150)`` as a list.

        ``find`` returns a set, so the list order does not reflect the
        score/speed ranking.

        :return: list of proxy addresses
        """
        best = self.find(limit=150)
        return list(best)
class CityStatisticsDatabase:
    def __init__(self):
        """Open the ``citystatistics`` collection of the ``regiondata``
        database on ``localhost:27017``.
        """
        client = MongoDB(conn_str='localhost:27017')
        wrapper = MonCollection(client,
                                database='regiondata',
                                collection_name='citystatistics')
        self.conn = wrapper.collection

    def find(self, *args, **kwargs):
        """Run a query and pivot the matching records into a wide table.

        :param args: positional arguments forwarded to the collection's
            ``find``
        :param kwargs: keyword arguments forwarded to the collection's
            ``find``
        :return: a DataFrame indexed by (acode, year, region) with one column
            per ``variable(unit)`` label, or ``None`` when nothing matched
        """
        records = list(self.conn.find(*args, **kwargs))
        if not records:
            return None

        frame = pd.DataFrame(records)
        # Column label of the pivot: the variable name followed by its unit
        # in parentheses.
        unit_suffix = frame['unit'].apply(lambda unit: '(' + unit + ')')
        frame['var'] = frame['variable'] + unit_suffix

        table = pd.pivot_table(frame,
                               values='value',
                               index=['year', 'acode', 'region'],
                               columns=['var'])
        # Put acode before year in the row index.
        return table.swaplevel(0, 1, axis=0)

    @property
    def variables(self):
        """Return the distinct ``variable`` values, sorted, as a one-column
        DataFrame.
        """
        names = self.conn.find().distinct('variable')
        ordered = sorted(names)
        return pd.DataFrame(ordered)

    @property
    def regions(self):
        """Placeholder; always returns ``None``."""
        return None
# Example #3
# 0
class AdminDivisionDatabase():
    """Query interface for the ``admindivision`` collection of the ``region``
    database.
    """
    def __init__(self):
        # Connect to the admindivision collection.
        mongo = MongoDB()
        mdb = MonDatabase(mongodb=mongo, database_name='region')
        self.collection = MonCollection(database=mdb,
                                        collection_name='admindivision')

    def find(self, **conds):
        """Query the collection with keyword conditions.

        Two keywords are reserved and never become part of the filter:

        * ``projection`` - projection document; when missing or ``None`` a
          default set of fields is used.
        * ``sorts`` - sort specification; when missing or ``None`` results are
          sorted by ``year`` and then ``acode``, both ascending.

        Every other keyword is a field condition: a ``list`` value is turned
        into an ``$in`` match, any other value is matched as given.

        :param conds: reserved keywords and field conditions
        :return: whatever ``self.collection.find(...).sort(...)`` returns
        """
        # pop() with a default removes the reserved keys even when the caller
        # passes them explicitly as None, so they never leak into the filter.
        projection = conds.pop('projection', None)
        if projection is None:
            projection = {
                'region': 1,
                'year': 1,
                'adminlevel': 1,
                'acode': 1,
                '_id': 1,
                'parent': 1,
                'uid': 1
            }

        sorts = conds.pop('sorts', None)
        if sorts is None:
            sorts = [('year', ASCENDING), ('acode', ASCENDING)]

        # Build the filter document from the remaining keywords.
        condition = {
            key: {'$in': value} if isinstance(value, list) else value
            for key, value in conds.items()
        }

        return self.collection.find(condition, projection).sort(sorts)

    @property
    def period(self):
        """Return the sorted distinct ``year`` values of the collection."""
        return sorted(self.find().distinct('year'))
# Pipeline switches. The steps guarded by these flags lie outside this
# fragment, so only the flag names and the original step labels are known.
TEMP2 = False
IS_MERGE_CITY_STAT = False
# h. Merge the founding year of each university
IS_MERGE_START_YEAR = False
# i. Add dummy variables for local and nearby universities
IS_ADD_LOCAL_VAR = False
IS_ADD_NEARBY_VAR = False
# j. Add local real per-capita GDP information
IS_ADD_LOCAL_PERGDP = True

# Export raw entrance-score records for each year from 2010 to 2017.
# IS_EXPORT_RAW_EXAM_SCORE is defined outside this fragment.
if IS_EXPORT_RAW_EXAM_SCORE:
    for year in range(2010, 2018):
        # Records of this year with type '文科' and batch '第一批', ordered by
        # region code and then university.
        found = entrance_score_con.find(
            {
                '年份': year,
                'type': '文科',
                "录取批次": "第一批"
            },
            sort=[('regioncode', ASCENDING), ('university', ASCENDING)])
        raw_dataframe = pd.DataFrame(list(found))
        # One workbook per year; the year is substituted into the file name.
        raw_dataframe.to_excel(
            r'E:\cyberspace\worklot\college\dataset\raw\{}年高考文科第一批录取分数横截面数据.xlsx'
            .format(str(year)))

        # Same query for batch '第二批'. The code that consumes this result is
        # not part of the visible fragment.
        found = entrance_score_con.find(
            {
                '年份': year,
                'type': '文科',
                "录取批次": "第二批"
            },
            sort=[('regioncode', ASCENDING), ('university', ASCENDING)])
# coding = UTF-8

import pandas as pd
from lib.base.database.class_mongodb import MongoDB, MonCollection

# Connect to the local MongoDB server and take the raw college_info collection.
mongo = MongoDB(conn_str='localhost:27017')
college_info_con = MonCollection(
    mongo,
    database='webdata',
    collection_name='college_info',
).collection

# Read the location and name of every record whose "高校性质" is "本科".
found = college_info_con.find(
    {"高校性质": "本科"},
    projection={
        '_id': False,
        '高校所在地': True,
        '学校': True,
    },
)
college_pd = pd.DataFrame(list(found))

# Write the result to an Excel workbook.
college_pd.to_excel(r'E:\cyberspace\worklot\college\colleges.xlsx')

# Example #6
# 0
# coding = UTF-8

import re
import pandas as pd
from pymongo import ASCENDING
from lib.base.database.class_mongodb import MongoDB, MonCollection

# 1. Initialisation: build a province name -> id mapping from MongoDB and
#    load the per-capita GDP workbook.
mongo = MongoDB(conn_str='localhost:27017')
province_con = MonCollection(mongo,
                             database='regiondata',
                             collection_name='provinces').collection
provinces = province_con.find(projection={
    '_id': False,
    'name': True,
    'id': True
},
                              sort=[('id', ASCENDING)])
province_dict = {province['name']: province['id'] for province in provinces}

perGDP_filepath = r'E:\cyberspace\worklot\college\province_perGDP.xlsx'
perGDP = pd.read_excel(perGDP_filepath)

# 2. Match province names: a row is assigned to every province whose name
#    contains the row's region text; if several match, the last one wins.
for ind in perGDP.index:
    # Raw string: '\s' is not a valid escape in a normal string literal.
    region = re.sub(r'\s+', '', perGDP.loc[ind, 'region'])
    for province in province_dict:
        # Literal substring test, so characters in the region text are never
        # interpreted as regular-expression syntax.
        if region in province:
            perGDP.loc[ind, 'province'] = province
            perGDP.loc[ind, 'acode'] = province_dict[province]
# Example #7
# 0
                    }


def college_replace(college_name):
    """Return the replacement registered for ``college_name`` in
    ``college_tranform``, or the name unchanged when there is none.
    """
    return college_tranform.get(college_name, college_name)

# 1. Database connections
mongo = MongoDB(conn_str='localhost:27017')
college_info_con = MonCollection(
    mongo,
    database='webdata',
    collection_name='college_info',
).collection
entrance_score_con = MonCollection(
    mongo,
    database='webdata',
    collection_name='gaokao_entrancescore',
).collection

# 2. Distinct university names stored in the entrance-score collection
entrance_colleges = entrance_score_con.find().distinct('university')

# 3. Load the 2011 ranking workbook, normalise the names and print every
#    school that does not appear in the entrance-score collection.
college_rate_2011_filepath = r'E:\cyberspace\worklot\college\2011年校友会大学排名.xlsx'
college_2011 = pd.read_excel(college_rate_2011_filepath)
college_2011['学校名称'] = college_2011['学校名称'].apply(college_replace)
for item in college_2011['学校名称']:
    if item in entrance_colleges:
        continue
    print(item)

# Same check for the 2012 ranking workbook.
college_rate_2012_filepath = r'E:\cyberspace\worklot\college\2012年校友会大学排名.xlsx'
college_2012 = pd.read_excel(college_rate_2012_filepath)
college_2012['学校名称'] = college_2012['学校名称'].apply(college_replace)
for item in college_2012['学校名称']:
    if item in entrance_colleges:
        continue
    print(item)
# Example #8
# 0
class CgssDatabase:
    """Read CGSS survey records and their label metadata from MongoDB."""

    def __init__(self, data_collection=None, label_collection=None):
        """Set up the data and label collections.

        :param data_collection: collection holding the survey records; when
            ``None`` the ``cgssdata`` collection of the ``surveydata``
            database on ``localhost:27017`` is opened
        :param label_collection: collection holding the label documents; when
            ``None`` the ``cgsslabel`` collection of the same database is
            opened
        """
        if data_collection is None:
            data_collection = self._open_default_collection('cgssdata')
        self._data_collection = data_collection

        if label_collection is None:
            label_collection = self._open_default_collection('cgsslabel')
        self._label_collection = label_collection

    @staticmethod
    def _open_default_collection(collection_name):
        """Open ``collection_name`` in ``surveydata`` on the local server."""
        return MonCollection(
            database=MonDatabase(
                mongodb=MongoDB(conn_str='localhost:27017'),
                database_name='surveydata'),
            collection_name=collection_name).collection

    def query(self, year, variables=None):
        """Load the records of one year together with their labels.

        :param year: value matched against the ``year`` field
        :param variables: names of the fields to load; ``None`` loads every
            field except ``_id`` and ``year``
        :return: dict with the keys ``"dataframe"`` (records, index starting
            at 1), ``"variable_labels"`` and ``"value_labels"``
        """
        if variables is not None:
            projection = {"_id": False}
            projection.update(dict.fromkeys(variables, True))
        else:
            projection = {"_id": False, "year": False}

        found = self._data_collection.find({"year": year},
                                           projection=projection)

        # NOTE(review): 2000 is presumably a chunk size for
        # iterator2dataframes - confirm against that helper.
        pdataframe = iterator2dataframes(found, 2000)

        # Restrict/order the columns to the requested variables and number
        # the rows from 1.
        pdataframe = pd.DataFrame(pdataframe, columns=variables)
        pdataframe.index = range(1, pdataframe.shape[0] + 1)

        return {
            "dataframe":
            pdataframe,
            "variable_labels":
            self.get_variable_label_df(year=year, variables=variables),
            "value_labels":
            self.get_variable_value_label_df(year=year, variables=variables)
        }

    def get_variable_label_df(self, year, variables=None):
        """Return a table of variable names and their labels.

        :param year: year whose label document is read
        :param variables: variables to include; ``None`` includes every
            variable of the label document
        :return: DataFrame with the columns ``variable`` and ``label`` and an
            index starting at 1
        :raises KeyError: if a requested variable has no label
        """
        var_label_dict = self.get_variable_label(year=year)
        if variables is None:
            variables = list(var_label_dict)

        # Both call styles produce the same column names.
        result = pd.DataFrame([(var, var_label_dict[var])
                               for var in variables],
                              columns=['variable', 'label'])
        result.index = range(1, result.shape[0] + 1)
        return result

    def get_variable_value_label_df(self, year, variables=None):
        """Return the value labels of the given variables in long form.

        :param year: year whose label documents are read
        :param variables: variables to include; ``None`` includes every
            variable of the variable-label document
        :return: DataFrame with the columns ``variable``, ``value`` and
            ``label``, or ``None`` when none of the variables has value labels
        :raises KeyError: if a variable has no entry in the link document
        """
        var_value_link = self.get_variable_value_link(year=year)
        value_labels = self.get_value_label(year=year)

        if variables is None:
            variables = list(self.get_variable_label(year=year))

        frames = []
        for var in variables:
            label_name = var_value_link[var]
            # An empty link means the variable has no value labels.
            if len(label_name) == 0:
                continue
            frame = pd.DataFrame(list(value_labels[label_name].items()),
                                 columns=["value", "label"])
            frame['variable'] = var
            frames.append(frame[["variable", "value", "label"]])

        if not frames:
            return None
        return pd.concat(frames)

    def _find_label_document(self, doc_type, year):
        """Return the first label document of ``doc_type`` for ``year``,
        without its ``_id``, ``type`` and ``year`` fields.
        """
        return self._label_collection.find(
            {
                "type": doc_type,
                "year": year
            },
            projection={
                "_id": False,
                "type": False,
                "year": False
            })[0]

    def get_variable_value_link(self, year):
        """Return the mapping that links variables to value-label names.

        :param year: year of the label document
        :return: the matching label document
        """
        # The spelling "lables" is the value queried in the database; it is
        # data, not documentation, and must not be corrected here.
        return self._find_label_document("variable value lables", year)

    def get_variable_label(self, year):
        """Return the variable labels of one year.

        :param year: year of the label document
        :return: the matching label document
        """
        return self._find_label_document("variable labels", year)

    def get_value_label(self, year):
        """Return the value labels of one year.

        :param year: year of the label document
        :return: the matching label document
        """
        return self._find_label_document("value labels", year)

    @property
    def year(self):
        """Return the distinct ``year`` values present in the data collection.

        :return: result of ``distinct('year')``
        """
        return self._data_collection.find().distinct('year')
class GaoKaoWebScraper():
    """Collect entrance-score pages from college.gaokao.com and store the
    parsed rows in MongoDB.

    The ``init_*`` methods fill URL queues; ``scrape`` works through a queue
    and writes score records.
    """

    # Request headers sent with every direct ``requests`` call.
    _HEADERS = {
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
    }
    # Record keys, in the column order of a score-table row.
    _SCORE_FIELDS = ('年份', '最低', '最高', '平均', '录取人数', '录取批次')
    # The first five columns are stored as integers, the last one as text.
    _NUMERIC_FIELD_COUNT = 5
    # Cell text that marks a missing value.
    _MISSING_CELL = '------'

    def __init__(self):
        mongo = MongoDB(conn_str='localhost:27017')
        # Queue of search-result page URLs.
        self._web_conn = MonCollection(mongo,
                                       database='cache',
                                       collection_name='gaokaoweb').collection
        # Queue of data-page URLs found on the search-result pages.
        self._data_web_conn = MonCollection(
            mongo, database='cache',
            collection_name='gaokaodataweb').collection
        # De-duplicated data-page URLs.
        self._university_web_conn = MonCollection(
            mongo, database='cache',
            collection_name='gaokaouniversityweb').collection
        # Destination of the parsed score records.
        self._data_conn = MonCollection(
            mongo, database='webdata',
            collection_name='gaokao_entrancescore').collection
        # Work queue consumed by scrape(). NOTE(review): this lives in the
        # 'webdata' database while init_three_stage() writes to 'cache' -
        # presumably the queue is copied over by hand; confirm.
        self._copy_data_web_conn = MonCollection(
            mongo, database='webdata',
            collection_name='gaokaouniversityweb').collection

    def init_first_stage(self):
        """Store the URL of every search-result page.

        For each ``a1..a31`` / ``b1..b31`` combination the first page is
        fetched, the page count is read from the ``#qx`` element, and one
        ``{'type': 'search', 'url': ...}`` record per page is inserted.
        """
        web_fmt = "http://college.gaokao.com/schpoint/{}/{}/{}/"

        for i in range(1, 32):
            for j in range(1, 32):
                first_key = 'a{}'.format(i)
                second_key = 'b{}'.format(j)
                url = web_fmt.format(first_key, second_key, 'p1')
                raw_result = requests.get(url, headers=self._HEADERS).text
                bs_obj = BeautifulSoup(raw_result, "lxml")

                # Reset for every page so that a page without pager text
                # neither reuses the previous count nor hits an unbound name.
                total_pages = ''
                first_string = next(iter(bs_obj.select('#qx')[0].strings),
                                    None)
                if first_string is not None:
                    # The count is the text between '/' and '页'.
                    after_slash = re.split('/', first_string)[1]
                    total_pages = re.split('页', after_slash)[0]

                if len(total_pages) > 0:
                    for m in range(1, int(total_pages) + 1):
                        web = web_fmt.format(first_key, second_key,
                                             'p{}'.format(m))
                        record = {'type': 'search', 'url': web}
                        print(record)
                        self._web_conn.insert_one(record)

    def init_second_stage(self):
        """Visit every stored search-result page and store the first link
        containing ``result`` found inside each ``.blue`` element as a
        ``{'type': 'data', 'url': ...}`` record.
        """
        for aitem in self._web_conn.find({'type': 'search'}):
            raw_result = requests.get(aitem['url'],
                                      headers=self._HEADERS).text
            bs_obj = BeautifulSoup(raw_result, "lxml")
            for obj in bs_obj.select('.blue'):
                found = obj.find_all(href=re.compile("result"))
                if len(found) > 0:
                    record = {'type': 'data', 'url': found[0]['href']}
                    print(record)
                    self._data_web_conn.insert_one(record)

    def init_three_stage(self):
        """Copy the distinct data-page URLs into the university URL queue."""
        university_urls = self._data_web_conn.find().distinct('url')
        for url in university_urls:
            self._university_web_conn.insert_one({'url': url})

    @classmethod
    def _build_score_record(cls, base_record, row):
        """Combine the page-level fields with one score-table row.

        :param dict base_record: fields shared by every row of the page
        :param row: sequence of six cell strings
        :return: new dict; ``base_record`` is left untouched
        :raises ValueError: if ``row`` does not have exactly six cells
        """
        if len(row) != len(cls._SCORE_FIELDS):
            raise ValueError('expected {} cells in a score row, got {}: {!r}'
                             .format(len(cls._SCORE_FIELDS), len(row), row))

        record = dict(base_record)
        for position, (field, cell) in enumerate(zip(cls._SCORE_FIELDS, row)):
            if cell == cls._MISSING_CELL:
                record[field] = None
            elif position < cls._NUMERIC_FIELD_COUNT:
                # Cells such as '650.0' are accepted and truncated to int.
                record[field] = int(float(cell))
            else:
                record[field] = cell
        return record

    def scrape(self, using_proxy=False):
        """Download queued pages five at a time and store their score rows.

        Each processed URL is removed from the queue; the loop ends when the
        queue is empty. Rows already present in the destination collection
        are not inserted again.

        :param using_proxy: forwarded to ``StaticWebScraper``
        :raises ValueError: if a non-empty table row does not have six cells
        """
        # NOTE(review): Collection.count() is deprecated in newer PyMongo
        # releases in favour of count_documents() - confirm the installed
        # version before upgrading.
        nums = self._copy_data_web_conn.count()
        while nums > 0:
            urls = [
                item['url'] for item in self._copy_data_web_conn.find(limit=5)
            ]
            print(urls)
            start = time.time()
            scraper = StaticWebScraper(urls=urls, using_proxy=using_proxy)
            scraper.start()

            # Each result is indexed as (html text, url).
            for html in scraper.result:
                url = html[1]
                bs_obj = BeautifulSoup(html[0], "lxml")
                # Page-level fields taken from the header fonts.
                record = dict(
                    zip(['university', 'region', 'type'], [
                        item.contents[0]
                        for item in bs_obj.select('.btnFsxBox > font')
                    ]))

                htmlparser = HtmlParser(html_content=bs_obj)
                table = htmlparser.table('#pointbyarea > table')
                for row in table:
                    if len(row) == 0:
                        continue
                    score_record = self._build_score_record(record, row)
                    # Skip rows that are already stored.
                    if self._data_conn.find_one(score_record) is None:
                        print('Insert..', score_record)
                        self._data_conn.insert_one(score_record)
                self._copy_data_web_conn.delete_one({'url': url})

            print('Total: {}'.format(time.time() - start))
            nums = self._copy_data_web_conn.count()
# Example #10
# 0
from lib.base.database.class_mongodb import MongoDB, MonCollection

# Connections to the three collections used by this script.
mongo = MongoDB(conn_str='localhost:27017')
province_con = MonCollection(
    mongo,
    database='regiondata',
    collection_name='provinces',
).collection
college_info_con = MonCollection(
    mongo,
    database='webdata',
    collection_name='college_info',
).collection
entrance_score_con = MonCollection(
    mongo,
    database='webdata',
    collection_name='gaokao_entrancescore',
).collection

# Province name -> id mapping, read in ascending id order.
provinces = province_con.find(
    projection={'_id': False, 'name': True, 'id': True},
    sort=[('id', ASCENDING)],
)
province_dict = dict((doc['name'], doc['id']) for doc in provinces)

# Switches selecting which part of the script runs.
COLLEGE_INFO = False
ENTRANCE_EXAM = True

if COLLEGE_INFO:
    for item in college_info_con.find(projection={
            '_id': True,
            '高校所在地': True,
            '学校': True
    }):
        location = item['高校所在地']
        for province in province_dict: