def get_details(self):
    """
    Put the fetched page into the queue.
    If there is a list_page_url, return the list of urls.
    :return:
    """
    r = Rabbit(host=setting['rabbitmq_host'], port=setting['rabbitmq_port'])
    channel = r.get_channel()
    channel.queue_declare(queue='hilder_gv')
    try:
        html_str = do_request(self.page_url, self.request_type, self.headers, self.encode)
        body = {'html': html_str,
                'analyzer_type': self.analyzer_type,
                'analyzer_rules_dict': self.analyzer_rules_dict,
                }
        # publish the message to the queue
        channel.basic_publish(exchange='',
                              routing_key='hilder_gv',
                              body=json.dumps(body))
        r.connection.close()
        print('message pushed to queue')
        if self.current_url_rule:
            current_page_list_url = self.get_current_page_url()
            return current_page_list_url
    except Exception as e:
        print(self.page_url, e)
def consume_all_url():
    rabbit = Rabbit(host='192.168.0.192', port=5673)
    connect_result = rabbit.connection
    channel = connect_result.channel()
    channel.basic_qos(prefetch_count=1)
    channel.basic_consume(callback, queue='amap_test')
    channel.start_consuming()
def __init__(self):
    self.proxy = [
        {"https": "http://192.168.0.96:4234"},
        {"https": "http://192.168.0.93:4234"},
        {"https": "http://192.168.0.90:4234"},
        {"https": "http://192.168.0.94:4234"},
        {"https": "http://192.168.0.98:4234"},
        {"https": "http://192.168.0.99:4234"},
        {"https": "http://192.168.0.100:4234"},
        {"https": "http://192.168.0.101:4234"},
        {"https": "http://192.168.0.102:4234"},
        {"https": "http://192.168.0.103:4234"},
    ]
    self.rabbit_connection = Rabbit(setting['CEIC']['rabbit']['host'],
                                    setting['CEIC']['rabbit']['port'])
def consume_all_url(api_key):
    rabbit = Rabbit(host=host, port=port)
    connection_result = rabbit.connection
    # connection = pika.BlockingConnection(pika.ConnectionParameters(host=host, port=port))
    channel = connection_result.channel()
    # channel.basic_qos(prefetch_count=1)
    channel.basic_consume(callback, queue='amap_url_list', consumer_tag=api_key)
    channel.start_consuming()
def asyn_message(_url):
    try:
        result = requests.get(_url, timeout=5)
        print(result.text, _url)
    except Exception as e:
        log.info('request error, url={}'.format(_url))
        return
    status = result.json()['status']
    if status == '1':
        count = int(result.json()['count'])
        if count != 0:
            if count < 50:
                print('count < 50')
                channel_result = connection_result.channel()
                channel_result.queue_declare(queue='amap_result_json')
                channel_result.basic_publish(exchange='',
                                             routing_key='amap_result_json',
                                             body=json.dumps(result.json()))
                channel_result.close()
            else:
                print('count > 50')
                r = Rabbit('192.168.0.192', 5673)
                channel_page = r.get_channel()
                # connection_page = pika.BlockingConnection(
                #     pika.ConnectionParameters(host='192.168.0.192', port=5673))
                # channel_page = connection_page.channel()
                channel_page.queue_declare(queue='amap_page_url')
                for i in range(1, int(count / 50 + 0.5)):
                    channel_page.basic_publish(
                        exchange='',
                        routing_key='amap_page_url',
                        body=result.url + '&page=' + str(i + 1),
                    )
                    print('page url pushed to queue')
                channel_page.close()
    else:
        log.info('url={}, result={}'.format(_url, result.text))
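# Note on the paging arithmetic above: int(count / 50 + 0.5) rounds to the
# nearest integer rather than rounding up, so e.g. count == 51 enqueues no
# extra pages at all. A minimal sketch of the (assumed) intent -- one extra
# page per started block of 50 beyond the first -- not part of the original:
import math


def extra_page_numbers(count, page_size=50):
    """Return the page numbers beyond page 1 needed to cover `count` records."""
    total_pages = math.ceil(count / page_size)
    return list(range(2, total_pages + 1))

# extra_page_numbers(51)  -> [2]
# extra_page_numbers(120) -> [2, 3]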
"https": "http://192.168.0.103:4234" }, ] m = Mongo('192.168.0.235') connect = m.connect setting = yaml.load(open('config.yaml')) db_name = setting['CEIC']['mongo']['db'] State_indicators_name = setting['CEIC']['mongo']['State_indicators'] State_indicators_details_name = setting['CEIC']['mongo'][ 'State_indicators_details'] log = LogHandler('ceic_detail') r = Rabbit(setting['CEIC']['rabbit']['host'], setting['CEIC']['rabbit']['port']) class Detail: def create_date( self, indexFrequency, start_year, start_mouth, end_year, ): """ :return: ['from=2016-1&to=2017-1', 'from=2016-1&to=2017-1', 'from=2016-1&to=2017-1', 'from=2016-1&to=2017-1',] """ """
""" import json from lib.rabbitmq import Rabbit from functools import partial import sys import yaml import pika import trip setting = yaml.load(open('config.yaml')) host = setting['amap']['rabbitmq']['host'] port = setting['amap']['rabbitmq']['port'] message_list = [] r_result = Rabbit(host=host, port=port) connection_result = r_result.connection channel = connection_result.channel() def callback(ch, method, properties, body): """ {'type': '010000', 'square_list': [73.010906, 44.471043, 73.510906, 43.971043]} :param ch: :param method: :param properties: :param body: :return: """ body = json.loads(body.decode('utf8'))
import json

from lib.log import LogHandler
from lib.mongo import Mongo
from lib.rabbitmq import Rabbit
import yaml

setting = yaml.load(open('config.yaml'))

# mongodb
m = Mongo('192.168.0.235')
connect = m.connect
db_name = setting['CEIC']['mongo']['db']
State_indicators_name = setting['CEIC']['mongo']['State_indicators']
State_indicators_details_name = setting['CEIC']['mongo'][
    'State_indicators_details']

# rabbit
r = Rabbit(setting['CEIC']['rabbit']['host'],
           setting['CEIC']['rabbit']['port'])
channel = r.get_channel()
queue = setting['CEIC']['rabbit']['queue']
channel.queue_declare(queue=queue)

log = LogHandler('ceic_detail')


class Consumer(object):
    def callback(self, ch, method, properties, body):
        ip = method.consumer_tag
        body = json.loads(body.decode())
        url = body['url']
        countryEnName = body['countryEnName']
        indexEnName = body['indexEnName']
        while True:
            proxy_ = {'https': ip}
class Toutiao:
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
        }
        self.start_url = "http://is.snssdk.com/api/news/feed/v46/?category=news_house"
        self.proxies = [
            {"http": "http://192.168.0.96:3234"},
            {"http": "http://192.168.0.93:3234"},
            {"http": "http://192.168.0.90:3234"},
            {"http": "http://192.168.0.94:3234"},
            {"http": "http://192.168.0.98:3234"},
            {"http": "http://192.168.0.99:3234"},
            {"http": "http://192.168.0.100:3234"},
            {"http": "http://192.168.0.101:3234"},
            {"http": "http://192.168.0.102:3234"},
            {"http": "http://192.168.0.103:3234"},
        ]
        self.bf = BloomFilter(host=setting['redies_host'],
                              port=setting['redis_port'],
                              key='article_toutiao_test',
                              blockNum=1,
                              db=0, )
        self.rabbit = Rabbit(host=setting['rabbitmq_host'],
                             port=setting['rabbitmq_port'])

    def start_crawler(self):
        channel = self.rabbit.get_channel()
        channel.queue_declare(queue='toutiao')
        while True:
            try:
                self.url_list_crawler(channel)
                time.sleep(60)
            except:
                continue

    def url_list_crawler(self, channel):
        while True:
            proxy = self.proxies[random.randint(0, 9)]
            try:
                response = requests.get(self.start_url, headers=self.headers, proxies=proxy)
                url_dict = json.loads(response.text)
                url_list = url_dict["data"]
                break
            except:
                continue
        for url_content in url_list:
            con = url_content["content"]
            try:
                url = re.search('display_url":"(.*?)"', con).group(1)
            except:
                continue
            if re.search('wukong', url):
                continue
            else:
                if self.bf.is_contains(url):  # filter detail-page urls already seen
                    log.info('url already in bloom filter: {}'.format(url))
                    continue
                else:
                    self.bf.insert(url)
                    log.info('url not in bloom filter, inserting new url: {}'.format(url))
                    article = Article('今日头条')
                    comment_code = Comment_url()
                    try:
                        organization_author = re.search(
                            '\\"source\\":\\"(.*?)\\"', con).group(1)
                        article.organization_author = organization_author
                    except Exception as e:
                        log.info('no organization_author')
                    title = re.findall('"title":"(.*?)"', con)[1]
                    article.title = title
                    article.url = url
                    article.article_id = re.search('group/(\d+)', url).group(1)
                    comment_code.group_id = article.article_id
                    comment_code.crawler_time = datetime.datetime.utcnow()
                    try:
                        comment_count = re.search('\\"comment_count\\":(\d+)', con).group(1)
                        article.comment_count = comment_count
                        comment_code.comment_count = comment_count
                    except Exception as e:
                        log.info('article "{}" has no comments'.format(title))
                    try:
                        title_img = re.search(
                            'middle_image.*?"url":"(.*?.webp)', con).group(1)
                        new_title_img = qiniufetch(title_img, title_img)
                        article.title_img = new_title_img
                    except Exception as e:
                        log.info('article "{}" has no list image'.format(title))
                    channel.basic_publish(exchange='',
                                          routing_key='toutiao',
                                          body=json.dumps(article.to_dict()))
                    log.info('article pushed to queue')
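# A minimal entry-point sketch (an assumption, not part of the original file):
if __name__ == '__main__':
    Toutiao().start_crawler()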
""" 爬取顺序:城市-区域-街道-菜系 start:3 """ from dianping.request_detail import request_get from lxml import etree import json import yaml import pika from lib.rabbitmq import Rabbit setting = yaml.load(open('config.yaml')) # rabbit r = Rabbit(setting['dianping']['rabbit']['host'], setting['dianping']['rabbit']['port']) connection = r.connection channel = connection.channel() region_queue = setting['dianping']['rabbit']['queue']['region_queue'] street_queue = setting['dianping']['rabbit']['queue']['street_queue'] first_queue = setting['dianping']['rabbit']['queue']['first_queue'] list_queue = setting['dianping']['rabbit']['queue']['list_queue'] channel.queue_declare(queue=region_queue) # 放入html队列 def html_put_in_queue(data): channel.queue_declare(queue=first_queue) channel.basic_publish(exchange='', routing_key=first_queue, body=json.dumps(data),
If there are more than 50 records, put the url into the amap_page_url queue.
"""
import json
from lib.rabbitmq import Rabbit
from functools import partial
import sys
import yaml
import pika
import trip

setting = yaml.load(open('config.yaml'))
host = setting['amap']['rabbitmq']['host']
port = setting['amap']['rabbitmq']['port']
r_result = Rabbit(host=host, port=port)
r_page = Rabbit(host=host, port=port)


def requests_a(result):
    print('-----------------{}'.format(result.text))
    if 'DAILY_QUERY_OVER_LIMIT' in result.text:
        sys.exit()
    try:
        status = result.json()['status']
    except Exception as e:
        print(e)
        print(result)
        return
    if status == '1':
        count = int(result.json()['count'])
def __init__(self):
    self.rabbit_connection = Rabbit(setting['CEIC']['rabbit']['host'],
                                    setting['CEIC']['rabbit']['port'])
from xiaozijia.user_headers import get_headers

log = LogHandler('小资家_build')
setting = yaml.load(open('config.yaml'))

# mongo
m = Mongo(setting['xiaozijia']['mongo']['host'],
          setting['xiaozijia']['mongo']['port'],
          user_name=setting['xiaozijia']['mongo']['user_name'],
          password=setting['xiaozijia']['mongo']['password'])
coll_build = m.connect[setting['xiaozijia']['mongo']['db']][
    setting['xiaozijia']['mongo']['build_coll']]

# rabbit
r = Rabbit(setting['xiaozijia']['rabbit']['host'],
           setting['xiaozijia']['rabbit']['port'])
channel = r.get_channel()
build_queue = setting['xiaozijia']['rabbit']['queue']['xiaozijia_build']
house_queue = setting['xiaozijia']['rabbit']['queue']['xiaozijia_house']
channel.queue_declare(queue=build_queue)
channel.queue_declare(queue=house_queue)

headers = get_headers()


def get_build_info(ch, method, properties, body):
    """
    Consume the xiaozijia_build queue: request the page, store the community
    record, and push the house-number page into the queue.
    :param ch:
    :param method:
    :param properties:
m = Mongo(setting['fgg_price_mongo']['host'],
          setting['fgg_price_mongo']['port'])
fgg = m.connect[setting['fgg_price_mongo']['db']]
coll = fgg[setting['fgg_price_mongo']['coll_fanggugu_price']]
coll_test = fgg[setting['fgg_price_mongo']['coll_fanggugu_price_update']]
coll_user = fgg[setting['fgg_price_mongo']['coll_user_info']]
coll_login = fgg[setting['fgg_price_mongo']['coll_login']]

# connect to rabbit
r = Rabbit(setting['fgg_price_rabbit']['host'],
           setting['fgg_price_rabbit']['port'], )
channel = r.get_channel()
channel.queue_declare(queue='fgg_comm_id')

IPS = ["192.168.0.90:4234", "192.168.0.93:4234", "192.168.0.94:4234",
       "192.168.0.96:4234", "192.168.0.98:4234", "192.168.0.99:4234",
       "192.168.0.100:4234", "192.168.0.101:4234", "192.168.0.102:4234",
       "192.168.0.103:4234"]
import json
import datetime
import re
import time

import yaml
from pymongo import MongoClient
from lib.log import LogHandler
from lib.rabbitmq import Rabbit

setting = yaml.load(open('config_local.yaml'))
log = LogHandler('article_consumer')
m = MongoClient(setting['mongo_config']['config_host'],
                setting['mongo_config']['port'])
m.admin.authenticate(setting['mongo_config']['user_name'],
                     setting['mongo_config']['password'])
collection = m[setting['mongo_config']['config_db']][
    setting['mongo_config']['coll_detail']]
clean_coll = m[setting['mongo_config']['config_db']][
    setting['mongo_config']['clean']]
rabbit = Rabbit(setting['rabbitmq_host'], setting['rabbitmq_port'])
connection = rabbit.connection


class CrawlerDetail:
    def __init__(self):
        self.proxy = Proxies()

    def start_consume(self):
        channel = connection.channel()
        channel.queue_declare(queue='usual_article')
        channel.basic_qos(prefetch_count=1)
        channel.basic_consume(self.consume_article_detail_url,
                              queue='usual_article',
                              no_ack=False)
        channel.start_consuming()
import requests
from lib.mongo import Mongo
from lib.rabbitmq import Rabbit
from urllib import parse
from lib.log import LogHandler
from backup.anew_fanggugu.user_names import username_list
from progressbar import *

log = LogHandler(__name__)
r = Rabbit('127.0.0.1', 5673)
channel = r.get_channel()
channel.queue_declare(queue='fgg_user_city')
m = Mongo('114.80.150.196', 27777, user_name='goojia', password='******')
# m = Mongo('127.0.0.1', 27018)
coll_comm = m.connect['fgg']['comm']
coll_build = m.connect['fgg']['build']
coll_house = m.connect['fgg']['house']


class ConsumerCity(object):
    """
    Fetch all communities, buildings, house numbers, and floor areas for every city.
    """

    def __init__(self):
        self.p = ProgressBar()
        self.headers = {'Authorization': ""}
        self.s = requests.session()
        self.currentCity = ''
        self.currentCityPy = ''
class Toutiao_Consumer:
    def __init__(self):
        self.rabbit = Rabbit(host=setting['rabbitmq_host'],
                             port=setting['rabbitmq_port'], )

    @staticmethod
    def parse_html(res):
        # res = requests.get(url=url, headers=headers)
        # split the url
        # article_id = re.search('\d+', url).group()
        if 'articleInfo' in res.text:
            # Toutiao url
            readable_title = Document(res.content).short_title()
            readable_article_ = re.search("articleInfo.*?content.*?'(.*?)'",
                                          res.content.decode(),
                                          re.S | re.M).group(1)
            readable_article = html_parser.unescape(readable_article_)
            source_detail = '今日头条'
            img_change = ImageReplace()
            # replace image links in the body of Toutiao-sourced articles
            readable_article = img_change.image_download(readable_article)
        else:
            # articles from other sources
            html_byte = re.sub(b'<script.*script>', b'', res.content)
            encode_dict = chardet.detect(html_byte)
            encode_type = encode_dict['encoding']
            readable_title = Document(html_byte.decode(encode_type)).short_title()
            readable_article = Document(html_byte.decode(encode_type)).summary()
            source_detail = 'other'
        return readable_title, readable_article, source_detail

    @staticmethod
    def get_post_time(res):
        if 'articleInfo' in res.text:
            # Toutiao url
            time = re.search("time: '(.*?)'", res.content.decode(), re.S | re.M).group(1)
            return time
        else:
            return None

    def callback(self, ch, method, properties, body):
        body = json.loads(body.decode())
        article = Article(body['source'])
        article.dict_to_attr(body)
        url = article.url
        while True:
            try:
                res = requests.get(url=url,
                                   headers=headers,
                                   proxies=proxies[random.randint(0, 9)],
                                   timeout=10)
                res.encoding = 'utf-8'
                if '<html><head></head><body></body></html>' not in res.text:
                    break
            except Exception as e:
                log.error('network request error: {}'.format(e))
        readable_title, readable_article, source_detail = self.parse_html(res)
        article.post_time = self.get_post_time(res)
        article.body = readable_article
        article.source_detail = source_detail
        article.crawler_time = datetime.datetime.now()
        if '<body id="readabilityBody"/>' in article.body:
            log.info('article body is empty')
        else:
            article.insert_db()
            log.info('{}: one article stored successfully'.format('今日头条'))
        ch.basic_ack(delivery_tag=method.delivery_tag)

    def consume_connect(self):
        connect = self.rabbit.get_connection()
        self.channel = connect.channel()
        self.channel.basic_qos(prefetch_count=1)
        self.channel.basic_consume(self.callback, queue='toutiao', no_ack=False)

    def start_consume(self):
        disconnected = True
        while disconnected:
            try:
                disconnected = False
                self.channel.start_consuming()
            except Exception as e:
                disconnected = True
                self.consume_connect()
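# A minimal entry-point sketch (an assumption, not part of the original file):
# consume_connect() must run once to create the channel before start_consume(),
# which then re-creates it whenever consuming fails.
if __name__ == '__main__':
    consumer = Toutiao_Consumer()
    consumer.consume_connect()
    consumer.start_consume()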
from lib.rabbitmq import Rabbit
from lib.mongo import Mongo
from login_fgg import Login
import datetime
import yaml

setting = yaml.load(open('config.yaml'))

# connect to MongoDB
m = Mongo(setting['comm_price']['host'], setting['comm_price']['port'])
fgg = m.connect[setting['comm_price']['db']]
coll = fgg[setting['comm_price']['fgg_coll']]
coll_login = fgg[setting['fgg']['user_info']]

# connect to rabbit
r = Rabbit('192.168.0.235', 5673)
channel = r.get_channel()

IPS = ["192.168.0.90:4234", "192.168.0.93:4234", "192.168.0.94:4234",
       "192.168.0.96:4234", "192.168.0.98:4234", "192.168.0.99:4234",
       "192.168.0.100:4234", "192.168.0.101:4234", "192.168.0.102:4234",
       "192.168.0.103:4234"]

login = Login()
class Toutiao:
    def __init__(self):
        self.start_url = 'https://www.toutiao.com/ch/news_house/'
        browser = webdriver.ChromeOptions()
        browser.add_argument('--headless')
        self.driver = webdriver.Chrome(chrome_options=browser)
        self.bf = BloomFilter(host=setting['redies_host'],
                              port=setting['redis_port'],
                              key='article_toutiao_test',
                              blockNum=1,
                              db=0, )
        self.rabbit = Rabbit(host=setting['rabbitmq_host'],
                             port=setting['rabbitmq_port'], )

    def start_crawler(self):
        self.driver.get(self.start_url)
        time.sleep(5)
        channel = self.rabbit.get_channel()
        channel.queue_declare(queue='article_test')
        while True:
            self.find_list_info(channel)
            self.driver.refresh()
            time.sleep(20)

    def find_list_info(self, channel):
        article_list = self.driver.find_elements_by_xpath(
            '/html/body/div/div[4]/div[2]/div[2]/div/div/div/ul/li')
        print('len, ', len(article_list))
        for i in article_list:
            if '看到这里' in i.text:
                print('reached end-of-list marker')
                break
            try:
                wenda = i.find_element_by_xpath('div/div[1]/div/div[2]/div[1]/div/a[2]').text
            except Exception as e:
                continue
            if '悟空问答' in wenda:
                print('skipping Wukong Q&A entry')
                continue
            article_id = i.get_attribute('group_id')
            # check article_id against the bloom filter
            if self.bf.is_contains(article_id):
                print('article_id already in bloom filter!')
                continue
            else:
                self.bf.insert(article_id)
                print('article_id not in bloom filter, inserted!')
            article = Article('今日头条')
            try:
                organization_author = i.find_element_by_xpath(
                    'div/div[1]/div/div[2]/div[1]/div/a[2]').text.replace('⋅', '')
                article.organization_author = organization_author.strip()
            except Exception as e:
                print('no organization_author')
            title = i.find_element_by_xpath('div/div[1]/div/div[1]/a').text
            article.title = title
            url = i.find_element_by_xpath('div/div[1]/div/div[1]/a').get_attribute('href')
            article.url = url
            # post_time = i.find_element_by_xpath('div/div[1]/div/div[2]/div[1]/span').text
            # article.post_time = post_time
            try:
                comment_str = i.find_element_by_xpath('div/div[1]/div/div[2]/div[1]/div/a[3]').text
                comment_count = int(re.search('\d+', comment_str, re.S | re.M).group())
                article.comment_count = comment_count
            except Exception as e:
                print('no comments on this article:', title)
            try:
                title_img = i.find_element_by_xpath('div/div[2]/a/img').get_attribute('src')
                article.title_img = [title_img]
            except Exception as e:
                print('no list image for this article:', title)
            print(article.to_dict())
            # publish articles that were not in the filter to rabbitmq
            channel.basic_publish(exchange='',
                                  routing_key='article_test',
                                  body=json.dumps(article.to_dict()))
            print('pushed to queue')
class Consumer(object):
    r = Rabbit(host=setting['rabbitmq_host'], port=setting['rabbitmq_port'])
    channel = r.get_channel()
    channel.queue_declare(queue='hilder_gv')

    def callback(self, ch, method, properties, body):
        body = json.loads(body.decode())
        analyzer_rules_dict = body['analyzer_rules_dict']
        analyzer_type = body['analyzer_type']
        co_index = analyzer_rules_dict['co_index']
        data_type = analyzer_rules_dict['data_type']
        html = body['html']
        try:
            self.common_use(analyzer_type, co_index, data_type, html, analyzer_rules_dict)
        except Exception as e:
            print(e)
        ch.basic_ack(delivery_tag=method.delivery_tag)

    def common_use(self, analyzer_type, co_index, data_type, html, analyzer_rules_dict):
        if data_type == 'comm':
            info = self.rule_data(analyzer_type, analyzer_rules_dict, html)
            try:
                self.put_database(info, data_type, co_index=co_index)
            except Exception as e:
                print(e)
        elif data_type == 'build':
            co_id_rule = analyzer_rules_dict['co_id']
            co_name_rule = analyzer_rules_dict['co_name']
            co_id = self.rule_type(analyzer_type, html, co_id_rule)
            co_name = self.rule_type(analyzer_type, html, co_name_rule)
            co_id = self.have_no_have(co_id)
            co_name = self.have_no_have(co_name)
            del analyzer_rules_dict['co_id']
            del analyzer_rules_dict['co_name']
            info = self.rule_data(analyzer_type, analyzer_rules_dict, html)
            try:
                self.put_database(info, data_type, co_index=co_index,
                                  co_id=co_id, co_name=co_name)
            except Exception as e:
                print(e)
        elif data_type == 'house':
            bu_id_rule = analyzer_rules_dict['bu_id']
            bu_num_rule = analyzer_rules_dict['bu_num']
            bu_id = self.rule_type(analyzer_type, html, bu_id_rule)
            bu_num = self.rule_type(analyzer_type, html, bu_num_rule)
            bu_id = self.have_no_have(bu_id)
            bu_num = self.have_no_have(bu_num)
            del analyzer_rules_dict['bu_id']
            del analyzer_rules_dict['bu_num']
            info = self.rule_data(analyzer_type, analyzer_rules_dict, html)
            try:
                self.put_database(info, data_type, co_index=co_index,
                                  bu_id=bu_id, bu_num=bu_num)
            except Exception as e:
                print(e)

    def rule_type(self, rule_type, html, rule):
        if rule:
            if rule_type == 'regex':
                data = re.findall(rule, html, re.S | re.M)
                return data
            else:
                tree = etree.HTML(html)
                data = tree.xpath(rule)
                return data
        else:
            return None

    @staticmethod
    def rule_data(analyzer_type, analyzer_rules_dict, html):
        tree = etree.HTML(html)
        info = {}
        for i in analyzer_rules_dict:
            if not analyzer_rules_dict[i]:
                continue
            if i == 'co_index' or i == 'data_type':
                continue
            if analyzer_type == 'regex':
                info_list = re.findall(analyzer_rules_dict[i], html, re.M | re.S)
            else:
                info_list = tree.xpath(analyzer_rules_dict[i])
            if info_list:
                info[i] = info_list
        if not info:
            print('\n\nno information extracted\n\n')
        return info

    @staticmethod
    def have_no_have(num):
        if num:
            return num[0]
        else:
            return None

    @staticmethod
    def add_attr(obj, info, index):
        for key, value in info.items():
            if value:
                setattr(obj, key, value[index].strip())
        obj.insert_db()

    # iterate over the info dict and write each record to the database
    def put_database(self, info, analyzer, co_index,
                     bu_id=None, bu_num=None, co_id=None, co_name=None):
        key = sorted(info.items())[0][0]
        length = len(info[key])
        for i in range(0, length):
            obj = self.get_data_obj(analyzer, co_index)
            if analyzer == 'comm':
                pass
            elif analyzer == 'build':
                if co_id:
                    setattr(obj, 'co_id', co_id)
                if co_name:
                    setattr(obj, 'co_name', co_name)
            elif analyzer == 'house':
                if bu_id:
                    setattr(obj, 'bu_id', bu_id.strip())
                if bu_num:
                    setattr(obj, 'bu_num', bu_num.strip())
            self.add_attr(obj, info, i)

    # create the data object matching data_type
    def get_data_obj(self, analyzer, co_index):
        if analyzer == 'comm':
            return Comm(co_index)
        elif analyzer == 'build':
            return Building(co_index)
        elif analyzer == 'house':
            return House(co_index)

    def consume_queue(self):
        self.channel.basic_qos(prefetch_count=1)
        self.channel.basic_consume(self.callback, queue='hilder_gv')
        self.channel.start_consuming()
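# A minimal entry-point sketch (an assumption, not part of the original file):
if __name__ == '__main__':
    Consumer().consume_queue()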
def connect_rabbit(self):
    r = Rabbit(self.r_host, self.r_port)
    return r.get_channel()
import requests
import json
from lib.log import LogHandler
from lib.mongo import Mongo
from lib.rabbitmq import Rabbit
from xiaozijia_gevent.user_headers import get_headers
from xiaozijia_gevent.user_list import user_list
from multiprocessing import Process
from gevent import monkey
import random

log = LogHandler(__name__)
m = Mongo('114.80.150.196', 27777, user_name='goojia', password='******')
coll_detail = m.connect['friends']['xiaozijia_detail']
r = Rabbit('localhost', 5673)
channel = r.get_channel()
monkey.patch_all()
headers = ''


def detail_message(info):
    global headers
    data = json.loads(info)
    username = random.choice(user_list)
    headers = get_headers(username)
    id = data['Id']
    ConstructionName = data['ConstructionName']
    try:
        detail_url = 'http://www.xiaozijia.cn/HouseInfo/' + str(id)
        result = requests.get(detail_url, headers=headers, timeout=10)
from login_fgg import Login
from lib.mongo import Mongo
from lib.rabbitmq import Rabbit
import random
import json
import requests

r = Rabbit('192.168.0.190', 5673)
connection = r.connection
channel = connection.channel()
channel.queue_declare(queue='fgg_all_city_code')
m = Mongo('192.168.0.235', 27017)
connect = m.connect
coll = connect['fgg']['user_info']
login = Login()
ips = [
    "192.168.0.90:4234",
    "192.168.0.93:4234",
    "192.168.0.94:4234",
    "192.168.0.96:4234",
    "192.168.0.98:4234",
    "192.168.0.99:4234",
    "192.168.0.100:4234",
    "192.168.0.101:4234",
    "192.168.0.102:4234",
    "192.168.0.103:4234"
]
known = '上海 35484,北京 20866,广州 16641,深圳 23559,大连 20751,厦门 15265,银川 17000,成都 13000,杭州 13000'


def put_queue_comm_id():
    headers = {
        'Cookie':