def get_many():
    """Flask view: return up to ``count`` proxies as JSON.

    Example: http://127.0.0.1:5000/many?count=2

    The optional ``count`` query parameter defaults to 1, preserving the
    previous behaviour (the old code ignored the parameter and always
    fetched a single proxy, and left debug prints behind).
    """
    try:
        count = int(request.args.get('count', 1))
    except ValueError:
        count = 1  # non-numeric count falls back to the old one-proxy behaviour
    proxies = MongoDB().get(count)
    result = [proxy['proxy'] for proxy in proxies]
    return jsonify({'result': result})
def check():
    """Periodically validate the proxies stored in the database.

    Runs forever: every 10 minutes, if the pool is non-empty, fetch all
    proxies and re-validate them via ``Validate().valid_many``.

    :return: never returns (infinite loop)
    """
    while True:
        m = MongoDB()
        count = m.get_count()
        if count:  # idiomatic truthiness instead of ``not count == 0``
            logging.info('开始检测数据库中代理可用性>>>>>>>>')
            proxies = m.get(count)
            Validate().valid_many(proxies, 'check')
        time.sleep(10 * 60)
def __init__(self):
    """Set up file logging and the MongoDB connection, then clear old data."""
    timestamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(time.time()))
    self.log = Logger('province_spider',
                      console=False,
                      file_name=timestamp + '.log').getLogger()
    self.db = MongoDB(auth=True,
                      host='localhost',
                      user='******',
                      password='******',
                      authSource='admin',
                      authMechanism='SCRAM-SHA-1')
    # Start each run from a clean slate: drop previously scraped documents.
    self.db.remove('weather', 'wea', {})
    super(ProvinceSpider, self).__init__()
class MongoTwitterConsumer:
    """Consume tweet records from a Kinesis stream and persist them to MongoDB.

    The consumer runs in a separate process (see :meth:`start`) so the MongoDB
    connection is created after the fork, avoiding the locking problem noted in
    http://api.mongodb.com/python/current/faq.html#multiprocessing.
    """

    def __init__(self, collection_name, delay=5):
        """
        :param collection_name: target MongoDB collection for incoming tweets
        :param delay: seconds to sleep between Kinesis polls
        """
        self.db = MongoDB()
        self.kinesis = boto3.client("kinesis")
        self.collection_name = collection_name
        self.delay = delay
        # The stream is provisioned with exactly one shard.
        self.shard_id = "shardId-000000000000"

    def run(self, stream_name):
        """Poll *stream_name* forever, writing each record into MongoDB.

        Intended to be the target of a child process; never returns.
        """
        print(
            f'Starting MongoDB consumer, db: {DB_NAME}, collection: {self.collection_name}'
        )
        # Connect to db. This must happen inside the process, otherwise there
        # can be a problem with locking (see class docstring).
        self.db.connect(DB_NAME)
        pre_shard_it = self.kinesis.get_shard_iterator(
            StreamName=stream_name,
            ShardId=self.shard_id,
            ShardIteratorType="LATEST")
        shard_it = pre_shard_it["ShardIterator"]
        while True:
            out = self.kinesis.get_records(ShardIterator=shard_it, Limit=1)
            shard_it = out["NextShardIterator"]
            if out['Records']:  # truthiness instead of len(...) > 0
                for rec in out['Records']:
                    print('Processing: ', rec['SequenceNumber'])
                    json_obj = json.loads(rec['Data'].decode('utf8'))
                    # Rename 'id' -> 'tweet_id' so it cannot be confused with
                    # MongoDB's own _id field; pop() replaces the old
                    # assign-then-del pair.
                    json_obj['tweet_id'] = json_obj.pop('id')
                    self.db.add_document(self.collection_name, json_obj)
            time.sleep(self.delay)

    def start(self, stream_name):
        """Launch :meth:`run` in a background process."""
        print(stream_name, flush=True)
        self.process = Process(target=self.run, args=(stream_name,))
        self.process.start()

    def stop(self):
        """Terminate the background consumer process."""
        print('Stopping consumer thread.')
        self.process.terminate()
def valid_one(self, proxy, method, url='https://baidu.com'):
    """Validate a single proxy by issuing a GET through it and measuring delay.

    :param proxy: dict with at least a 'proxy' key (address used for both
                  http and https schemes)
    :param method: 'insert' to add a working proxy to the database,
                   'check' to refresh its delay (or delete it when invalid)
    :param url: probe URL used to test reachability
    """
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0'
    }
    proxies = {'http': proxy['proxy'], 'https': proxy['proxy']}
    try:
        start_time = time.time()
        resp = requests.get(url, headers=headers, proxies=proxies, timeout=8)
        # round() keeps two decimal places of the measured round-trip time
        delay = round(time.time() - start_time, 2)
        if resp.status_code == 200:
            proxy['delay'] = delay
            if method == 'insert':
                MongoDB().insert(proxy)
            elif method == 'check':
                MongoDB().update({'proxy': proxy['proxy']},
                                 {'delay': proxy['delay']})
        else:
            logging.info(f'无效ip: {proxy}')
            if method == 'check':
                MongoDB().delete({'proxy': proxy['proxy']})
    except (ProxyError, ConnectTimeout):
        logging.info(f'无效ip: {proxy}')
        if method == 'check':
            MongoDB().delete({'proxy': proxy['proxy']})
    except requests.RequestException as err:
        # Narrowed from a bare ``except: pass`` so that programming errors
        # (KeyError, ...) and KeyboardInterrupt are no longer swallowed;
        # other transport failures remain best-effort but are now logged.
        logging.debug('proxy request failed: %s', err)
def test_same_lastname(self):
    """ Positive test, males' last names are the same"""
    db = MongoDB()
    for collection in ("family", "individual"):
        db.drop_collection(collection)
    ged = Gedcom('./GEDCOM_files/us16/us16_male_last_name_same.ged')
    ged.insert_to_mongo()
    self.assertEqual(ged.us16_male_last_name(debug=True), [])
def test_diff_lastname(self):
    """ Negative test, males' last names are different"""
    db = MongoDB()
    for collection in ("family", "individual"):
        db.drop_collection(collection)
    ged = Gedcom('./GEDCOM_files/us16/us16_male_last_name_diff.ged')
    ged.insert_to_mongo()
    expected = [('@F1@', '@I2@, @I3@', 'LastName,Test')]
    self.assertEqual(ged.us16_male_last_name(debug=True), expected)
def get_one():
    """Flask view: return one randomly chosen proxy as JSON.

    Bug fix: the old code fetched only ONE proxy from the database yet drew a
    random index in ``[0, get_count() - 1]``, raising IndexError whenever the
    database held more than one proxy and the index came out non-zero. Now all
    proxies are fetched and the random choice is made over that full list.
    """
    db = MongoDB()
    proxies = db.get(db.get_count())
    result = [proxy['proxy'] for proxy in proxies]
    return jsonify(dict(proxy=random.choice(result)))
def delete():
    """Flask view: remove the proxy named in the ``proxy`` query parameter."""
    params = request.args
    MongoDB().delete({'proxy': params['proxy']})
    return '删除成功:{}'.format(params)
class ProvinceSpider(scrapy.Spider):
    '''
    Spider that walks weather.com.cn from the province index page down to
    county-level pages and stores 7-day forecasts into MongoDB ('weather.wea').
    '''
    name = 'province_spider'
    allowed_domains = ['weather.com.cn']
    start_urls = ['http://www.weather.com.cn/province/']

    def __init__(self):
        # Log file is named after the spider start time, e.g. 2020-01-02-03-04-05.log
        file_name = time.strftime('%Y-%m-%d-%H-%M-%S',
                                  time.localtime(time.time())) + '.log'
        self.log = Logger('province_spider', console=False,
                          file_name=file_name).getLogger()
        self.db = MongoDB(
            auth=True,
            host='localhost',
            user='******',
            password='******',
            authSource='admin',
            authMechanism='SCRAM-SHA-1')
        # Drop previously scraped data so each run starts from a clean slate.
        self.db.remove('weather', 'wea', {})
        super(ProvinceSpider, self).__init__()

    def parse(self, response):
        '''
        Parse the province index page and schedule one request per province.
        '''
        provinces = []
        for li in response.xpath('//div[@class="sheng_rukou"]/ul/li'):
            name = li.xpath('.//text()').extract_first()
            # constant.PIG_ZONE presumably lists regions to skip -- TODO confirm
            if name not in constant.PIG_ZONE:
                provinces.append({
                    'url': li.xpath('a/@href').extract_first(),
                    'province': name
                })
        for p in provinces:
            # Province name travels down the callback chain via request meta.
            yield scrapy.Request(p['url'], callback=self.parse_city, meta=p)

    def parse_city(self, response):
        '''
        Parse cities/districts of a province page.
        '''
        # Parent province (or direct-administered city) carried in meta.
        province_info = response.meta
        cities = []
        for a in response.xpath('//div[@class="navbox"]/span/a'):
            cities.append({
                'url': response.urljoin(a.xpath('@href').extract_first()),
                'city': a.xpath('.//text()').extract_first()
            })
        # Guangdong's province homepage uses a different layout, so fall back
        # to an alternate xpath when the first one matches nothing.
        if not cities:
            for a in response.xpath('//div[@class="area_Weather"]/ul/li'):
                cities.append({
                    'url':
                    response.urljoin(a.xpath('./a/@href').extract_first()),
                    'city': a.xpath('./a/text()').extract_first()
                })
        for c in cities:
            yield scrapy.Request(c['url'],
                                 callback=self.parse_county,
                                 meta={
                                     'province': province_info['province'],
                                     'city': c['city']
                                 })

    def parse_county(self, response):
        '''
        Parse counties of a city page.
        '''
        city_info = response.meta
        # Direct-administered cities (constant.DIRECT_CITY) have no county
        # level, so parse the weather data straight from this page.
        if city_info['province'] in constant.DIRECT_CITY:
            self.parse_direct_weather(response, city_info)
        else:
            counties = []
            for a in response.xpath('//div[@class="navbox"]/span/a'):
                counties.append({
                    'url':
                    response.urljoin(a.xpath('@href').extract_first()),
                    'county': a.xpath('.//text()').extract_first()
                })
            for c in counties:
                city_info['county'] = c['county']
                yield scrapy.Request(c['url'],
                                     callback=self.parse_county_weather,
                                     meta=city_info)

    def parse_county_weather(self, response):
        '''
        Parse county-level weather data.
        '''
        meta = response.meta
        self._parse_weather(response, meta)

    def parse_direct_weather(self, response, meta):
        '''
        Parse weather data for a direct-administered city (no county level).
        '''
        self._parse_weather(response, meta)

    def _parse_weather(self, response, meta):
        # Shared extraction of the 7-day forecast block ('#7d') and insertion
        # into MongoDB, used by both county and direct-city callbacks.
        seven_day_weather = []
        for li in response.xpath('//div[@id="7d"]/ul[@class="t clearfix"]/li'):
            # Relative day label (e.g. "today/tomorrow")
            h1 = li.xpath('./h1/text()').extract_first()
            # Weather description
            desc = li.xpath('./p[@class="wea"]/text()').extract_first()
            # High / low temperatures
            max_tem = li.xpath('./p[@class="tem"]/span/text()').extract_first()
            min_tem = li.xpath('./p[@class="tem"]/i/text()').extract_first()
            # Wind direction(s)
            wind_direction = li.xpath('.//em/span/@title').extract()
            # Wind force -- fragile xpath, may break if the page layout changes
            wf = li.xpath('.//i/text()').extract()
            wind_force = wf[-1] if len(wf) >= 2 else 'unkonw'
            seven_day_weather.append({
                'day': h1,
                'desc': desc,
                'max_tem': max_tem,
                'min_tem': min_tem,
                'wind_direction': wind_direction,
                'wind_force': wind_force
            })
        self.log.info("========province:%s=======city:%s========county:%s",
                      meta['province'], meta['city'], meta.get('county', None))
        data = {
            'province': meta['province'],
            'city': meta['city'],
            'county': meta.get('county', None),
            'data': seven_day_weather
        }
        self.db.insert('weather', 'wea', data)
def __init__(self, collection_name, delay=5):
    """Store configuration and create the MongoDB/Kinesis clients.

    :param collection_name: MongoDB collection records are written to
    :param delay: polling interval in seconds
    """
    self.collection_name = collection_name
    self.delay = delay
    self.db = MongoDB()
    self.kinesis = boto3.client("kinesis")
    # The stream is provisioned with exactly one shard.
    self.shard_id = "shardId-000000000000"
""" US26: Less than 150 years old Benji, Feb 24th, 2019 Death should be less than 150 years after birth for dead people, and current date should be less than 150 years after birth for all living people """ import os import unittest from gedcom_ajry import Gedcom from mongo_db import MongoDB MONGO = MongoDB() class test_us26(unittest.TestCase): """ Test cases for US26""" def test_indi_entry_bleach(self): """ Individual data missed in family collection.""" self.assertEqual( Gedcom('GEDCOM_files/us26/us26_indi_entry_bleach.ged'). us26_corrspnding_entries(debug=True), [('Individual', '@I4@')]) def test_no_err(self): """ Positive test for US26.""" self.assertEqual( Gedcom('GEDCOM_files/us26/us26_no_err.ged'). us26_corrspnding_entries(debug=True), []) if __name__ == '__main__': unittest.main(exit=False, verbosity=2)