Example #1
def get_many():
    # e.g. http://127.0.0.1:5000/many?count=2
    # number of proxies requested, passed as a query parameter
    count = int(flask.request.args.get('count', 1))
    proxies = MongoDB().get(count)
    result = [proxy['proxy'] for proxy in proxies]
    return jsonify({'result': result})
Example #2
def check():
    '''
    Periodically check the availability of the proxies in the database.
    :return:
    '''
    while True:
        m = MongoDB()
        count = m.get_count()
        if count != 0:
            logging.info('Checking availability of the proxies in the database >>>>>>>>')
            proxies = m.get(count)
            Validate().valid_many(proxies, 'check')
        time.sleep(10 * 60)  # re-check every 10 minutes
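check() blocks forever, so the proxy pool presumably runs it in the background; a minimal sketch, assuming it is launched next to the web API (the launch strategy is not shown in the source):

import threading

# Run the periodic availability check in a daemon thread so it does not
# block the rest of the program (an assumption, not part of the original).
threading.Thread(target=check, daemon=True).start()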
Example #3
    def __init__(self):
        file_name = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(time.time())) + '.log'
        self.log = Logger('province_spider', console=False, file_name=file_name).getLogger()
        self.db = MongoDB(
            auth=True,
            host='localhost',
            user='******',
            password='******',
            authSource='admin',
            authMechanism='SCRAM-SHA-1')

        # clear any weather data left over from the previous run
        self.db.remove('weather', 'wea', {})
        super(ProvinceSpider, self).__init__()
Example #4
import json
import time
from multiprocessing import Process

import boto3

# MongoDB and DB_NAME come from the surrounding module (not shown here).


class MongoTwitterConsumer:
    def __init__(self, collection_name, delay=5):
        self.db = MongoDB()
        self.kinesis = boto3.client("kinesis")
        self.collection_name = collection_name
        self.delay = delay
        self.shard_id = "shardId-000000000000"  # the stream has only one shard

    def run(self, stream_name):
        print(
            f'Starting MongoDB consumer, db: {DB_NAME}, collection: {self.collection_name}'
        )

        # Connect to db. This must happen inside process otherwise there can be a problem with
        # locking: http://api.mongodb.com/python/current/faq.html#multiprocessing.
        self.db.connect(DB_NAME)

        pre_shard_it = self.kinesis.get_shard_iterator(
            StreamName=stream_name,
            ShardId=self.shard_id,
            ShardIteratorType="LATEST")
        shard_it = pre_shard_it["ShardIterator"]

        while True:
            out = self.kinesis.get_records(ShardIterator=shard_it, Limit=1)
            shard_it = out["NextShardIterator"]
            if len(out['Records']) > 0:
                for rec in out['Records']:
                    print('Processing: ', rec['SequenceNumber'])
                    bytes_data = rec['Data']
                    json_obj = json.loads(bytes_data.decode('utf8'))
                    json_obj['tweet_id'] = json_obj['id']
                    del json_obj['id']
                    self.db.add_document(self.collection_name, json_obj)
            time.sleep(self.delay)

    def start(self, stream_name):
        print(stream_name, flush=True)
        self.process = Process(target=self.run, args=(stream_name, ))
        self.process.start()

    def stop(self):
        print('Stopping consumer process.')
        self.process.terminate()
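A minimal usage sketch for the consumer; the stream and collection names below are made-up placeholders, not values from the source:

consumer = MongoTwitterConsumer(collection_name='tweets', delay=5)
consumer.start('twitter-stream')  # spawns a child process running run()
# ... later, terminate the child process:
consumer.stop()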
Example #5
    def valid_one(self, proxy, method, url='https://baidu.com'):

        headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0'
        }
        proxies = {'http': proxy['proxy'], 'https': proxy['proxy']}

        try:
            start_time = time.time()
            resp = requests.get(url,
                                headers=headers,
                                proxies=proxies,
                                timeout=8)
            delay = round(time.time() - start_time, 2)  # response time in seconds, rounded to 2 decimals
            if resp.status_code == 200:
                proxy['delay'] = delay
                if method == 'insert':
                    MongoDB().insert(proxy)
                elif method == 'check':
                    MongoDB().update({'proxy': proxy['proxy']},
                                     {'delay': proxy['delay']})
            else:
                logging.info(f'invalid proxy: {proxy}')
                if method == 'check':
                    MongoDB().delete({'proxy': proxy['proxy']})
        except (ProxyError, ConnectTimeout):
            logging.info(f'invalid proxy: {proxy}')
            if method == 'check':
                MongoDB().delete({'proxy': proxy['proxy']})
        except Exception:
            # swallow any other request errors; the proxy is left untouched
            pass
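For reference, a hypothetical call, assuming valid_one lives on the same Validate class used in Example #2 (the proxy address below is made up): a working proxy is timed against the target URL and inserted; a dead one is logged and, in 'check' mode, deleted.

Validate().valid_one({'proxy': 'http://1.2.3.4:8080'}, method='insert')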
Example #6
    def test_same_lastname(self):
        """ Positive test: males' last names are the same."""
        mongo_instance = MongoDB()
        mongo_instance.drop_collection("family")
        mongo_instance.drop_collection("individual")

        ged = Gedcom('./GEDCOM_files/us16/us16_male_last_name_same.ged')
        ged.insert_to_mongo()

        self.assertEqual(ged.us16_male_last_name(debug=True), [])
Example #7
    def test_diff_lastname(self):
        """ Negative test: males' last names are different."""
        mongo_instance = MongoDB()
        mongo_instance.drop_collection("family")
        mongo_instance.drop_collection("individual")

        ged = Gedcom('./GEDCOM_files/us16/us16_male_last_name_diff.ged')
        ged.insert_to_mongo()

        self.assertEqual(ged.us16_male_last_name(debug=True),
                         [('@F1@', '@I2@, @I3@', 'LastName,Test')])
Example #8
def get_one():
    # fetch all proxies, then return one chosen at random
    count = MongoDB().get_count()
    proxies = MongoDB().get(count)
    result = [proxy['proxy'] for proxy in proxies]
    x = random.randint(0, count - 1)
    return jsonify(dict(proxy=result[x]))
Example #9
def delete():
    args = request.args
    MongoDB().delete({'proxy': args['proxy']})
    return 'Deleted: {}'.format(args)
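get_many(), get_one(), and delete() are clearly Flask view functions (the comment in Example #1 references http://127.0.0.1:5000/many), but the app wiring is not shown; a minimal sketch, with route paths inferred from the example URLs:

import flask
from flask import jsonify, request

app = flask.Flask(__name__)

# Route paths are assumptions based on the comment URLs, not the source.
app.add_url_rule('/many', 'get_many', get_many)
app.add_url_rule('/one', 'get_one', get_one)
app.add_url_rule('/delete', 'delete', delete)

if __name__ == '__main__':
    app.run(host='127.0.0.1', port=5000)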
Example #10
class ProvinceSpider(scrapy.Spider):
    '''
        Crawl provinces (and their cities/counties) from weather.com.cn.
    '''
    name = 'province_spider'
    allowed_domains = ['weather.com.cn']
    start_urls = ['http://www.weather.com.cn/province/']
    
    def __init__(self):
        file_name = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(time.time())) + '.log'
        self.log = Logger('province_spider', console=False, file_name=file_name).getLogger()
        self.db = MongoDB(
            auth=True,
            host='localhost',
            user='******',
            password='******',
            authSource='admin',
            authMechanism='SCRAM-SHA-1')

        # clear any weather data left over from the previous run
        self.db.remove('weather', 'wea', {})
        super(ProvinceSpider, self).__init__()

    def parse(self, response):
        '''
            Parse the province list.
        '''
        provinces = []
        for li in response.xpath('//div[@class="sheng_rukou"]/ul/li'):
            name = li.xpath('.//text()').extract_first()
            if name not in constant.PIG_ZONE:
                provinces.append({
                    'url': li.xpath('a/@href').extract_first(),
                    'province': name
                })
        for p in provinces:
            yield scrapy.Request(p['url'], callback=self.parse_city, meta=p)

    def parse_city(self, response):
        '''
            Parse the cities/districts of a province.
        '''
        # the parent province / municipality
        province_info = response.meta

        cities = []
        for a in response.xpath('//div[@class="navbox"]/span/a'):
            cities.append({
                'url': response.urljoin(a.xpath('@href').extract_first()),
                'city': a.xpath('.//text()').extract_first()
            })
        # Guangdong's province page uses a different layout
        if not cities:
            for a in response.xpath('//div[@class="area_Weather"]/ul/li'):
                cities.append({
                    'url': response.urljoin(a.xpath('./a/@href').extract_first()),
                    'city': a.xpath('./a/text()').extract_first()
                })
        for c in cities:
            yield scrapy.Request(c['url'], callback=self.parse_county, meta={
                'province': province_info['province'],
                'city': c['city']
            })
        
        
    def parse_county(self, response):
        '''
            Parse the counties of a city.
        '''
        city_info = response.meta

        # municipalities have no counties below them; parse the weather directly
        if city_info['province'] in constant.DIRECT_CITY:
            self.parse_direct_weather(response, city_info)

        else:
            counties = []
            for a in response.xpath('//div[@class="navbox"]/span/a'):
                counties.append({
                    'url': response.urljoin(a.xpath('@href').extract_first()),
                    'county': a.xpath('.//text()').extract_first()
                })
            for c in counties:
                # build a fresh meta dict per request; mutating the shared
                # city_info would leak the last county into every request
                meta = dict(city_info, county=c['county'])
                yield scrapy.Request(c['url'], callback=self.parse_county_weather, meta=meta)
        
    def parse_county_weather(self, response):
        '''
            Parse a county's weather data.
        '''
        meta = response.meta
        self._parse_weather(response, meta)


    def parse_direct_weather(self, response, meta):
        '''
            Parse a municipality's weather data.
        '''
        self._parse_weather(response, meta)


    def _parse_weather(self, response, meta):
        seven_day_weather = []
        for li in response.xpath('//div[@id="7d"]/ul[@class="t clearfix"]/li'):
            # relative date
            h1 = li.xpath('./h1/text()').extract_first()
            # weather description
            desc = li.xpath('./p[@class="wea"]/text()').extract_first()
            # max / min temperature
            max_tem = li.xpath('./p[@class="tem"]/span/text()').extract_first()
            min_tem = li.xpath('./p[@class="tem"]/i/text()').extract_first()
            # wind direction
            wind_direction = li.xpath('.//em/span/@title').extract()
            # wind force; this selector is fragile
            wf = li.xpath('.//i/text()').extract()
            wind_force = wf[-1] if len(wf) >= 2 else 'unknown'

            seven_day_weather.append({
                'day': h1,
                'desc': desc,
                'max_tem': max_tem,
                'min_tem': min_tem,
                'wind_direction': wind_direction,
                'wind_force': wind_force
            })
        self.log.info("========province:%s=======city:%s========county:%s",
                      meta['province'], meta['city'], meta.get('county', None))

        data = {
            'province': meta['province'],
            'city': meta['city'],
            'county': meta.get('county', None),
            'data': seven_day_weather
        }
        self.db.insert('weather', 'wea', data)
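How the original project launches this spider is not shown; a minimal sketch using Scrapy's CrawlerProcess (project settings omitted):

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess()
process.crawl(ProvinceSpider)
process.start()  # blocks until the crawl finishes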
Example #12
""" US26: Less than 150 years old
    Benji, Feb 24th, 2019
    Death should be less than 150 years after birth for dead people, and
    current date should be less than 150 years after birth for all living people
"""

import unittest
from gedcom_ajry import Gedcom
from mongo_db import MongoDB

MONGO = MongoDB()


class test_us26(unittest.TestCase):
    """ Test cases for US26"""
    def test_indi_entry_bleach(self):
        """ Individual data missed in family collection."""
        self.assertEqual(
            Gedcom('GEDCOM_files/us26/us26_indi_entry_bleach.ged').
            us26_corrspnding_entries(debug=True), [('Individual', '@I4@')])

    def test_no_err(self):
        """ Positive test for US26."""
        self.assertEqual(
            Gedcom('GEDCOM_files/us26/us26_no_err.ged').
            us26_corrspnding_entries(debug=True), [])


if __name__ == '__main__':
    unittest.main(exit=False, verbosity=2)