def __init__(self):
    self.base_url = "https://weixin.sogou.com/weixin"
    self.keyword = KEY
    self.headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;'
                  'q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': COOKIES,
        'Host': 'weixin.sogou.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/73.0.3683.86 Safari/537.36',
    }
    self.session = Session()
    self.queue = RedisQueue()
    self.mysql = Mysql()
class RedisQueueWorker(object):
    def __init__(self, redis_mgr, service_name, custom_key, func_name,
                 callback_to_main_thread=False):
        self.service_name = service_name
        self.func_name = func_name
        self.redis_queue = RedisQueue(redis_mgr)
        self.custom_key = custom_key
        self.redis_queue.subscribe(self.service_name, custom_key)
        self.callback_to_main_thread = callback_to_main_thread

    def _real_start(self):
        while True:
            try:
                item = self.redis_queue.get(self.custom_key)
                if self.callback_to_main_thread:
                    IOLoop.instance().add_callback(self.func_name, item)
                else:
                    self.func_name(item)
            except Exception as e:  # Python 3 syntax; the original "except Exception, e" no longer parses
                logger.warn("start_work error:%s not found msg", e)
                time.sleep(1)
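# A minimal, hypothetical usage sketch for RedisQueueWorker above. It assumes
# the project's RedisQueue wrapper accepts whatever "redis_mgr" object the
# surrounding code passes in; the service name, key, and handler below are
# illustrative only, not taken from the source.
import threading

redis_mgr = ...  # the project's Redis manager object (not shown in this section)

def handle_item(item):
    print('dequeued: %r' % (item,))

worker = RedisQueueWorker(redis_mgr, 'demo_service', 'demo_key', handle_item)
threading.Thread(target=worker._real_start, daemon=True).start()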
def main():
    with RecordsDB() as records_db:
        records_parser = RecordsParser(records_db)
        with CountriesDB() as countries_db:
            queue = RedisQueue(name='jobs', namespace='queue', decode_responses=True)
            job_in_json = queue.wait_and_dequeue()
            while job_in_json is not None:
                job = json.loads(job_in_json)
                country_id = job['country_id']
                country_name = countries_db.get_country_from_id(country_id)
                num_records = job['num_records']
                if country_name is None:
                    raise Exception("Country name cannot be None!")
                records_parser.get_records(country=country_name,
                                           country_id=country_id,
                                           max_records=num_records)
                job_in_json = queue.wait_and_dequeue()
class RedisPublisher(object):
    def __init__(self, topic, host='localhost', port=6379, db=0):
        self.rqueue = RedisQueue(topic, 1, host=host, port=port, db=db)

    def redis_send_pyobj(self, obj):
        self.rqueue.put_and_trim(pkl.dumps(obj))

    def redis_send(self, msg):
        self.rqueue.put_and_trim(msg)
def populate_job_queue():
    queue = RedisQueue('jobs')
    with CountriesDB() as countries_db:
        countries = countries_db.get_countries()
        for country in countries:
            job = {'country_id': country[0], 'num_records': 5000}
            job_in_json = json.dumps(job)
            queue.enqueue(job_in_json)
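# The producer above and the consumer in the earlier main() share an
# enqueue()/wait_and_dequeue() API that is not shown in this section. Below is
# a minimal sketch of one plausible implementation on top of redis-py
# (LPUSH/BRPOP gives blocking FIFO semantics); the real project's class may
# differ in details.
import redis

class RedisQueue(object):
    def __init__(self, name, namespace='queue', **redis_kwargs):
        self._db = redis.Redis(**redis_kwargs)   # accepts host/port/db/decode_responses
        self._key = '%s:%s' % (namespace, name)

    def enqueue(self, item):
        # push on the left; BRPOP on the right preserves FIFO order
        self._db.lpush(self._key, item)

    def wait_and_dequeue(self, timeout=0):
        # blocks until an item arrives; timeout=0 means wait forever
        popped = self._db.brpop(self._key, timeout=timeout)
        return popped[1] if popped else None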
def __init__(self, status_queue, config, stop_event):
    super().__init__()
    self.energy_data_queue = RedisQueue('normal')
    self.status_queue = status_queue
    self.reader = self.init_reader()
    self.solar_ip = config['solar_ip']
    self.solar_url = self.solar_ip + config['solar_url']
    self.stop_event = stop_event
    self.console_mode = config["console_mode"] == "true"
def __init__(self, stop_event):
    super().__init__()
    self.energy_data_queue = RedisQueue('normal')
    self.stop_event = stop_event
    self.default_message = self.get_default_message()
    self.total_usage = random.randint(1000, 5000)
    self.total_redelivery = random.randint(1000, 5000)
    self.total_solar = random.randint(1000, 5000)
    self.total_gas = random.randint(1000, 5000)
class RedisSubscriber(object):
    def __init__(self, topic, host='localhost', port=6379, db=0):
        self.rqueue = RedisQueue(topic, 1, host=host, port=port, db=db)

    def redis_recv_pyobj(self, blocking=True):
        item = self.rqueue.get(isBlocking=blocking)
        if item is None:
            return None
        return pkl.loads(item)

    def redis_recv(self, blocking=True):
        return self.rqueue.get(isBlocking=blocking)
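# A hedged usage sketch pairing the publisher and subscriber defined above;
# it assumes a local Redis server and the RedisQueue(topic, maxlen, ...)
# wrapper these classes are built on. The topic name is illustrative.
pub = RedisPublisher('telemetry')
sub = RedisSubscriber('telemetry')

pub.redis_send_pyobj({'temp': 21.5})
print(sub.redis_recv_pyobj(blocking=True))  # -> {'temp': 21.5}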
def __init__(self, time_execution_in_sec, chart_title, slave, *args, **kwargs):
    super(MyTaskSet, self).__init__(time_execution_in_sec, chart_title, slave, *args, **kwargs)
    self.running = True
    self.slave = slave
    self.code = None
    self.queue_chart = RedisQueue(name="data_chart", namespace="data_chart")
    self.queue_tasks = RedisQueue(name="data_tasks", namespace="data_tasks")
    self.chart = ReportCharts(time_execution_in_sec, chart_title, self.slave)
    self.db = create_engine(self.config["database"]["db_string"])
def __init__(self, status_queue, stop_event, config):
    super(Sender, self).__init__()
    self.normal_data_queue = RedisQueue('normal')
    self.retry_data_queue = RedisQueue('retry')
    self.status_queue = status_queue
    self.stop_event = stop_event
    self.base_url = config["api_url"]
    self.key = config["key"]
    self.store_energy_url = self.base_url + "/v2/energy"
    self.backup_file = "backup"
    self.console_mode = config["console_mode"] == "true"
    self.connected = False
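# The fields above imply a send loop that drains the 'normal' queue and parks
# failed payloads on the 'retry' queue. Below is a hypothetical sketch of one
# iteration of that loop, not the project's actual run() method; the header
# name and timeout are assumptions.
import requests

def send_one(sender):
    payload = sender.normal_data_queue.get_nowait()
    if payload is None:
        return
    try:
        requests.post(sender.store_energy_url,
                      headers={'X-Api-Key': sender.key},  # header name is an assumption
                      data=payload, timeout=10)
    except requests.RequestException:
        # network trouble: keep the payload for a later attempt
        sender.retry_data_queue.put(payload)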
class WebServerClass(BaseHTTPRequestHandler):
    def __init__(self, *args, **kwargs):
        self.singleton = Singleton()
        self.queue_chart = RedisQueue(name="data_chart", namespace="data_chart")
        super(WebServerClass, self).__init__(*args, **kwargs)

    def _set_headers(self):
        self.send_response(200)
        self.send_header('Content-type', 'text/html')
        self.end_headers()

    def do_GET(self):
        if self.path == "/":
            self.path = self.singleton.template_path
            self._set_headers()
            with open(self.singleton.template_path, "rb") as f:
                self.wfile.write(f.read())
            while True:
                data = self.queue_chart.get().decode("utf-8")
                data = data.replace("'", '"')
                data = json.loads(data)
                if data:
                    self.wfile.write(
                        '<script type="text/javascript">AddDataChart(chart_id={}, data={});</script>'
                        .format(data["chart_id"], data["data"]).encode(encoding='utf_8'))
        return
def main():
    comment_queue = RedisQueue('reddit-book-stream', host=REDIS_HOST, port=REDIS_PORT)
    reddit = praw.Reddit(user_agent=USER_AGENT,
                         client_id=CLIENT_ID, client_secret=CLIENT_SECRET,
                         username=USERNAME, password=PASSWORD)

    # stream comments from r/all; pause_after < 0 allows faster streaming
    for comment in reddit.subreddit('all').stream.comments(pause_after=-1):
        if comment and mentions_book(comment.body):
            comment_queue.put(comment.id)
            print(comment.id)
            print(f'reddit.com/api/info?id=t1_{comment.id}')
def main():
    rq = RedisQueue('reddit-book-stream', host=REDIS_HOST, port=REDIS_PORT)
    reddit = praw.Reddit(user_agent=USER_AGENT,
                         client_id=CLIENT_ID, client_secret=CLIENT_SECRET,
                         username=USERNAME, password=PASSWORD)
def __init__(self):
    self.command_queue = RedisQueue(Config.DOWN_QUEUE_NAME)
    self.result_queue = RedisQueue(Config.UP_QUEUE_NAME)
    self.socket = websocket.WebSocketApp(HOST,
                                         on_open=self.on_open,
                                         on_message=self.on_message,
                                         on_error=self.on_error,
                                         on_close=self.on_close)
    while True:
        try:
            self.socket.run_forever(ping_interval=100)
        except Exception:  # swallow errors and reconnect after a pause
            pass
        time.sleep(5)
class SerialWorker:
    def __init__(self):
        self.trigger = False
        self.result_queue = RedisQueue(Config.UP_QUEUE_NAME)
        self.command_queue = RedisQueue(Config.DOWN_QUEUE_NAME)
        self.port = serial.Serial("/dev/ttyS0", 9600,
                                  parity=serial.PARITY_NONE,
                                  stopbits=serial.STOPBITS_ONE,
                                  bytesize=serial.EIGHTBITS,
                                  timeout=Config.SERIAL_WAIT)
        self.start()

    def start(self):
        while True:
            self.executeTask()
            time.sleep(Config.SERIAL_CYC)

    def executeTask(self):
        GPIO.output(Config.EN_485, GPIO.HIGH)
        command = self.command_queue.get_nowait()
        if not command:
            # no queued command: alternate between the two default polls
            self.trigger = not self.trigger
            if self.trigger:
                command = DEFAULT_COMMAND
            else:
                command = DEFAULT_COMMAND2
        print('write to 485 %s' % command)
        command = CommandHelper.toWriteable(command)
        self.port.write(command)
        while self.port.out_waiting > 0:
            time.sleep(0.01)
        GPIO.output(Config.EN_485, GPIO.LOW)
        result = self.port.readall()
        if result:
            result = CommandHelper.toReadable(result)
            print('receive from 485 %s' % result)
            self.result_queue.put(result)
class RedisQueueSender(object):
    executor = ThreadPoolExecutor(2)

    def __init__(self, redis_mgr):
        self.redis_mgr = redis_mgr
        self.queue = RedisQueue(redis_mgr)

    @run_on_executor
    def send(self, service_name, data):
        return self.queue.send_msg(service_name, data)
def __init__(self, name, collector):
    if SCHEDULER_PERSIST:
        # distributed or persistent mode: back the queue with Redis
        self.queue = RedisQueue(name=name)
        # and use Redis as the dedup container
        self._filter_container = RedisFilterContainer()
    else:
        self.queue = Queue()
        # otherwise a plain Python set() is enough for dedup
        self._filter_container = NoramlFilterContainer()
    # counts how many duplicate requests were seen
    self.collector = collector
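# A minimal sketch of the set-based dedup container the non-persistent branch
# above assumes (the Redis-backed variant would expose the same interface).
# The method names are assumptions; the class name keeps the source's spelling.
class NoramlFilterContainer(object):
    def __init__(self):
        self._seen = set()

    def exists(self, fp):
        # True if this request fingerprint was already scheduled
        return fp in self._seen

    def add(self, fp):
        self._seen.add(fp)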
async def main():
    msg = "stockx spider starting!"
    print(msg)
    logging.info(msg)
    q = RedisQueue('rq')
    # create the aiohttp client session shared by all requests
    async with aiohttp.ClientSession() as client:
        tasks = []
        for k, v in URL.items():
            for page in range(1, 25):
                api_url = DOMAIN + v + str(page)
                tasks.append(asyncio.create_task(spiderList(client, api_url, q)))
                await asyncio.sleep(10)
        # wait for every spider task, not just the last one created
        done, pending = await asyncio.wait(tasks)
        if not pending:
            print('[crawl complete] all crawl tasks have finished')
            logging.info("[crawl complete] all crawl tasks have finished")
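# spiderList() itself is not shown in this section. Below is a minimal sketch
# of a coroutine compatible with the call above; the JSON layout ('Products')
# and the queue payload are assumptions for illustration only.
import json

async def spiderList(client, api_url, q):
    async with client.get(api_url) as resp:
        data = await resp.json()
    for product in data.get('Products', []):  # hypothetical response key
        q.put(json.dumps(product))            # hand each item to the Redis queue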
def test_failure(self):
    qkey = 'test:failure:queue'
    tid = '12'
    rq = RedisQueue(self.rc, 1, 2)
    self.rc.lpush(qkey, tid)
    tid0 = rq.safe_pop(qkey)
    self.assertEqual(tid, tid0)
    ## Popping another task too fast, before the task timeout has been
    ## reached.
    tid1 = rq.safe_pop(qkey)
    self.assertIsNone(tid1)
    ## Supposing the worker had died before finishing the task, we can take
    ## it again after the task timeout.
    time.sleep(2)
    tid2 = rq.safe_pop(qkey)
    self.assertEqual(tid, tid2)
    ## Marking the task as done should make it impossible to retrieve the
    ## same task.
    rq.mark_done(qkey, tid2)
    time.sleep(2)
    tid3 = rq.safe_pop(qkey)
    self.assertIsNone(tid3)
    self.rc.delete(qkey, "%s:done" % qkey)
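# The test above implies at-least-once semantics with a visibility timeout: a
# popped task stays leased until task_timeout elapses, then becomes poppable
# again unless mark_done() was called. Below is a sketch of one way to
# implement that with redis-py; the constructor signature
# (client, poll_timeout, task_timeout) is inferred from RedisQueue(self.rc, 1, 2)
# and may not match the real class.
import time

class RedisQueue(object):
    def __init__(self, client, poll_timeout, task_timeout):
        self.rc = client
        self.poll_timeout = poll_timeout
        self.task_timeout = task_timeout

    def safe_pop(self, qkey):
        now = time.time()
        # re-queue leased tasks whose timeout expired and that were never marked done
        expired = self.rc.zrangebyscore('%s:leases' % qkey, 0, now - self.task_timeout)
        for tid in expired:
            self.rc.zrem('%s:leases' % qkey, tid)
            if not self.rc.sismember('%s:done' % qkey, tid):
                self.rc.lpush(qkey, tid)
        tid = self.rc.rpop(qkey)
        if tid is None:
            return None
        self.rc.zadd('%s:leases' % qkey, {tid: now})  # start the lease clock
        return tid

    def mark_done(self, qkey, tid):
        self.rc.zrem('%s:leases' % qkey, tid)
        self.rc.sadd('%s:done' % qkey, tid)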
async def main(loop):
    print("starting the spider")
    # wait for the MySQL connection pool to be ready
    pool = await aiomysql.create_pool(host=conf.database['host'],
                                      port=conf.database['port'],
                                      user=conf.database['user'],
                                      password=conf.database['passwd'],
                                      db=conf.database['db'],
                                      loop=loop)
    q = RedisQueue('rq')
    tasks = []
    for k, v in URL.items():
        for page in range(30):
            api_url = DOMAIN + v + str(page)
            tasks.append(asyncio.create_task(spiderList(pool, api_url, q)))
            await asyncio.sleep(1)
    # wait for every spider task, not just the last one created
    done, pending = await asyncio.wait(tasks)
    if not pending:
        print('[crawl complete] all crawl tasks have finished')
        logging.info("[crawl complete] all crawl tasks have finished")
class A3CActorThread(object):
    def __init__(self, thread_index, global_network):
        self.thread_index = thread_index
        self.local_network = global_network
        self.game_state = GameState()
        self.local_t = 0

        # for log
        self.episode_reward = 0.0
        self.episode_start_time = 0.0
        self.prev_local_t = 0

        self.rq = RedisQueue(REDIS_QUEUE_NAME)
        return

    def choose_action(self, policy_output):
        if random.random() < RANDOM_ACTION_PROBILITY:
            return random.randint(0, ACTION_DIM - 1)
        # sample an action from the policy's probability distribution
        values = []
        total = 0.0
        for rate in policy_output:
            total += rate
            values.append(total)
        r = random.random() * total
        for i in range(len(values)):
            if values[i] >= r:
                return i
        return len(values) - 1

    def _record_log(self, sess, global_t, summary_writer, summary_op,
                    reward_input, reward, time_input, living_time):
        summary_str = sess.run(summary_op, feed_dict={
            reward_input: reward,
            time_input: living_time
        })
        summary_writer.add_summary(summary_str, global_t)
        return

    def process(self, sess, global_t, summary_writer, summary_op,
                reward_input, time_input):
        states = []
        actions = []
        rewards = []
        values = []
        terminal_end = False

        # reduce the influence of socket connecting time
        if self.episode_start_time == 0.0:
            self.episode_start_time = timestamp()

        start_local_t = self.local_t

        for i in range(LOCAL_T_MAX):
            policy_, value_ = self.local_network.run_policy_and_value(
                sess, self.game_state.s_t)
            if self.thread_index == 0 and self.local_t % 1000 == 0:
                print('policy=', policy_)
                print('value=', value_)

            action_id = self.choose_action(policy_)
            states.append(self.game_state.s_t)
            actions.append(action_id)
            values.append(value_)

            self.game_state.process(action_id)
            reward = self.game_state.reward
            terminal = self.game_state.terminal

            self.episode_reward += reward
            rewards.append(reward)
            self.local_t += 1

            # s_t1 -> s_t
            self.game_state.update()

            if terminal:
                terminal_end = True
                episode_end_time = timestamp()
                living_time = episode_end_time - self.episode_start_time
                self._record_log(sess, global_t, summary_writer, summary_op,
                                 reward_input, self.episode_reward,
                                 time_input, living_time)
                print("global_t=%d / reward=%.2f / living_time=%.4f" % (
                    global_t, self.episode_reward, living_time))

                # reset variables
                self.episode_reward = 0.0
                self.episode_start_time = episode_end_time
                self.game_state.reset()
                if USE_LSTM:
                    self.local_network.reset_lstm_state()
                break
            # log
            if self.local_t % 2000 == 0:
                living_time = timestamp() - self.episode_start_time
                self._record_log(sess, global_t, summary_writer, summary_op,
                                 reward_input, self.episode_reward,
                                 time_input, living_time)
        # -----------end of batch (LOCAL_T_MAX)--------------------

        R = 0.0
        if not terminal_end:
            R = self.local_network.run_value(sess, self.game_state.s_t)
        # print('global_t: %d, R: %f' % (global_t, R))

        states.reverse()
        actions.reverse()
        rewards.reverse()
        values.reverse()

        batch_state = []
        batch_action = []
        batch_td = []
        batch_R = []

        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + GAMMA * R
            td = R - Vi
            action = np.zeros([ACTION_DIM])
            action[ai] = 1

            batch_state.append(si)
            batch_action.append(action)
            batch_td.append(td)
            batch_R.append(R)

            # put it into the redis queue for asynchronous training
            data = cPickle.dumps((si, action, td, R))
            self.rq.put(data)

        diff_local_t = self.local_t - start_local_t
        return diff_local_t
# -*- coding:utf-8 -*-
__author__ = '张全亮'
import requests
import urllib3
import math
import time
import datetime
from multiprocessing.dummy import Pool
import hashlib

urllib3.disable_warnings()
from logger import Logger
from redis_queue import RedisQueue

yz = RedisQueue('yz')
yz_rec = RedisQueue('yz_rec')
logger = Logger()

"""Order-check rule: scan the orders page by page and stop paging once a matching order is found."""

def check_pay(order_sn, pdduid, kdtsessionid):
    cookie = 'KDTSESSIONID={}'.format(kdtsessionid)
    first_url = 'https://h5.youzan.com/v2/trade/order/list.json?perpage=20&page=1&type=all'
    headers = {
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36",
        'Cookie': cookie
    }
    res = requests.get(first_url, headers=headers, verify=False)
    if '页面已被删除' in res.text:  # literal match for "this page has been deleted"
#!/usr/bin/env python
# coding:utf-8
# Copyright (C) dirlt

from redis_queue import RedisQueue

command_queue = RedisQueue('command')
command_queue.put('trigger')
import datetime
from mysql_db import db_insert
from flask import Flask, jsonify, request, redirect, render_template
from redis_queue import RedisQueue

app = Flask(__name__)

# order-placing side
from pdd_spider import pdd_main
from yz_spider import yz_main
# query side
from pdd_query import pdd_pass_query
from yz_query import yz_pass_query

pdd = RedisQueue('pdd')
yz = RedisQueue('yz')

"""Pinduoduo order-placing spider"""

def pdd_spider(pdduid, accesstoken, goods_url, amount, order_number):
    result = pdd_main(pdduid, accesstoken, goods_url, amount, order_number)
    return result

"""Youzan order-placing spider"""

def yz_spider(pdduid, kdtsessionid, goods_url, amount, order_number):
    result = yz_main(pdduid, kdtsessionid, goods_url, amount, order_number)
def add_job_to_queue(country_id, num_records):
    queue = RedisQueue('jobs')
    job = {'country_id': country_id, 'num_records': num_records}
    job_in_json = json.dumps(job)
    queue.enqueue(job_in_json)
class A3C(object):
    def __init__(self):
        self.device = '/gpu:0' if USE_GPU else '/cpu:0'
        self.stop_requested = False
        self.global_t = 0

        if USE_LSTM:
            self.global_network = A3CLSTMNetwork(STATE_DIM, STATE_CHN, ACTION_DIM, self.device, -1)
        else:
            self.global_network = A3CFFNetwork(STATE_DIM, STATE_CHN, ACTION_DIM, self.device)
        self.global_network.create_loss(ENTROPY_BETA)

        self.initial_learning_rate = log_uniform(INITIAL_ALPHA_LOW, INITIAL_ALPHA_HIGH,
                                                 INITIAL_ALPHA_LOG_RATE)
        print('initial_learning_rate:', self.initial_learning_rate)
        self.learning_rate_input = tf.placeholder('float')
        self.optimizer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate_input,
                                                   decay=RMSP_ALPHA,
                                                   momentum=0.0,
                                                   epsilon=RMSP_EPSILON)
        grads_and_vars = self.optimizer.compute_gradients(
            self.global_network.total_loss, self.global_network.get_vars())
        self.apply_gradients = self.optimizer.apply_gradients(grads_and_vars)

        self.actor_threads = []
        for i in range(PARALLEL_SIZE):
            actor_thread = A3CActorThread(i, self.global_network)
            self.actor_threads.append(actor_thread)

        self.sess = tf.InteractiveSession()
        self.sess.run(tf.initialize_all_variables())

        self.reward_input = tf.placeholder(tf.float32)
        tf.scalar_summary('reward', self.reward_input)
        self.time_input = tf.placeholder(tf.float32)
        tf.scalar_summary('living_time', self.time_input)

        self.summary_op = tf.merge_all_summaries()
        self.summary_writer = tf.train.SummaryWriter(LOG_FILE, self.sess.graph)

        self.saver = tf.train.Saver()
        self.restore()

        self.lock = threading.Lock()
        self.rq = RedisQueue(REDIS_QUEUE_NAME)
        self.train_count = 0
        return

    def restore(self):
        checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print("checkpoint loaded:", checkpoint.model_checkpoint_path)
            tokens = checkpoint.model_checkpoint_path.split("-")
            # set global step
            self.global_t = int(tokens[1])
            print(">>> global step set: ", self.global_t)
        else:
            print("Could not find old checkpoint")
        return

    def backup(self):
        if not os.path.exists(CHECKPOINT_DIR):
            os.mkdir(CHECKPOINT_DIR)
        self.saver.save(self.sess, CHECKPOINT_DIR + '/' + 'checkpoint', global_step=self.global_t)
        return

    def predict_function(self, parallel_index, lock):
        actor_thread = self.actor_threads[parallel_index]
        while True:
            if self.stop_requested or (self.global_t > MAX_TIME_STEP):
                break
            diff_global_t = actor_thread.process(
                self.sess, self.global_t,
                self.summary_writer, self.summary_op,
                self.reward_input, self.time_input
            )
            self.global_t += diff_global_t
            if self.global_t % 1000000 < LOCAL_T_MAX:
                self.backup()
            # print('global_t:', self.global_t)
        return

    def train_function(self, index, lock):
        batch_state = []
        batch_action = []
        batch_td = []
        batch_R = []

        while True:
            if self.stop_requested or (self.global_t > MAX_TIME_STEP):
                break
            data = self.rq.get()
            (state, action, td, R) = cPickle.loads(data)
            batch_state.append(state)
            batch_action.append(action)
            batch_td.append(td)
            batch_R.append(R)
            if len(batch_R) < BATCH_SIZE:
                continue

            lock.acquire()
            self.sess.run(self.apply_gradients, feed_dict={
                self.global_network.state_input: batch_state,
                self.global_network.action_input: batch_action,
                self.global_network.td: batch_td,
                self.global_network.R: batch_R,
                self.learning_rate_input: self.initial_learning_rate
            })
            self.train_count += 1
            lock.release()

            batch_state = []
            batch_action = []
            batch_td = []
            batch_R = []
            print('train_index:', index, 'train_count:', self.train_count)
        return

    def signal_handler(self, signal_, frame_):
        print('You pressed Ctrl+C !')
        self.stop_requested = True
        return

    def run(self):
        predict_treads = []
        for i in range(PARALLEL_SIZE):
            predict_treads.append(threading.Thread(target=self.predict_function, args=(i, self.lock)))

        signal.signal(signal.SIGINT, self.signal_handler)

        for t in predict_treads:
            t.start()

        train_threads = []
        for i in range(TRAIN_SIZE):
            train_threads.append(threading.Thread(target=self.train_function, args=(i, self.lock)))
            train_threads[i].start()

        print('Press Ctrl+C to stop')
        signal.pause()

        print('Now saving data....')
        for t in predict_treads:
            t.join()
        for t in train_threads:
            t.join()

        self.backup()
        return
class Zhihu_crawler():
    def __init__(self, url):
        self.queue = RedisQueue('zhihu', host='localhost', port=6379, db=0)
        self.url = url
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36",
            "Host": "www.zhihu.com",
            "Referer": "www.zhihu.com",  # fixed: the original key "Refer" is not a real HTTP header
            "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate, sdch, br",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive"}
        # cookies (note: "_zap" appears twice in the original; the second value wins)
        self.cookies = {
            "_zap": "aaf2a75d-0a1b-4863-b8a0-23ff0f4a9002",
            "_za": "e73a8db5-0824-4c36-b6a2-7a5378a046f7",
            "udid": '"AFAAY31blAmPTta9QIqu7S6lUdEK97RWDgg=|1457941793"',
            "d_c0": '"AGBAzqyTowmPTpYh7UrYZSjcr43LFX006Tw=|1461248461"',
            "_zap": "267bc327-098d-4d7c-85cb-3cfd13cd2e8e",
            "q_c1": "3b3a3dccecf1499ea32a0b2da9be35ec|1470149980000|1445741536000",
            "_xsrf": "8a812fd7745e54a8e8ab4ed815fa9001",
            "l_cap_id": '"YzQ3YzNhNzUxZjBlNDAzNTgwM2FhNzdlODI5NjAxZjY=|1472298711|d67a5a1c7e5fb41cfe2715e389c74ebc6132007d"',
            "cap_id": '"ZGQwYTE0MTM3ODk0NDUzOGFkM2RiNGYxYTNmYTc1YTM=|1472298711|8fd9f406e4786a9ca56227b61e7c6a2a5c0f4b42"',
            "login": '******',
            "n_c": '1',
            "s-t": "autocomplete",
            "s-q": "volley%2Cretrofit%2Cokhttp",
            "s-i": "1",
            "sid": "6vahoruo",
            "a_t": '"2.0AEAAukjbcgoXAAAATjPpVwBAALpI23IKAGBAzqyTowkXAAAAYQJVTfYL6VcAoZ3PJyuvTIR4Yl3RS9B_tCnMwHxnX7iDfjl2Ve7xk-Nk6RdV68h4_A=="',
            "z_c0": "Mi4wQUVBQXVramJjZ29BWUVET3JKT2pDUmNBQUFCaEFsVk45Z3ZwVndDaG5jOG5LNjlNaEhoaVhkRkwwSC0wS2N6QWZB|1472308814|21bb41cc3844239f4582374fc850ced4a5e8c564",
            "__utma": "51854390.226515891.1472287250.1472298703.1472307196.4",
            "__utmc": "51854390",
            "__utmz": "51854390.1472296126.2.2.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided)",
            "__utmv": "51854390.100--|2=registration_date=20160827=1^3=entry_date=20151025=1"}

    def send_request(self):
        # URL of the user's followees page
        followees_url = self.url + '/followees'
        session = requests.session()
        # note: requests.get() below bypasses this session, so these proxies are never applied
        session.proxies = {
            "http": "http://124.88.67.17.251:8685",
            "https": "http://223.67.136.218:8920",
        }
        # issue the request (verify=True keeps HTTPS certificate checking on)
        r = requests.get(followees_url, cookies=self.cookies, headers=self.headers, verify=True)
        try:
            r.raise_for_status()
        except requests.HTTPError as e:
            print(str(e) + ' HttpError')
        except requests.ConnectionError as e:
            print(str(e))

        content = r.text
        if r.status_code == requests.codes.ok:
            self.parse_users_content(content)
            print("requests success!")

    # check whether the scraped data exists
    def judge_data_have(self, name, datas):
        if datas:
            # print(datas[0])
            return datas[0]
        else:
            # print(name + " not exist!")
            return ''

    # parse the page data
    def parse_users_content(self, html_source):
        # initialise the fields we want to extract
        self.user_name = ''
        self.user_gender = ''
        self.user_location = ''
        self.user_followees = ''
        self.user_followers = ''
        self.user_be_agreed = ''
        self.user_be_thanked = ''
        self.user_education_school = ''
        self.user_education_subject = ''
        self.user_employment = ''
        self.user_employment_extra = ''
        self.user_intro = ''
        self.followees_urls = ''

        tree = etree.HTML(html_source)
        self.user_name = self.judge_data_have("name", tree.xpath('//a[@class = "name"]/text()'))
        self.user_location = self.judge_data_have("location", tree.xpath('//span[@class = "location item"]/@title'))
        self.user_gender = self.judge_data_have("gender", tree.xpath('//span[@class = "item gender"]/i/@class'))
        if self.user_gender:
            if 'female' in self.user_gender:
                self.user_gender = 'female'
            elif 'male' in self.user_gender:
                self.user_gender = 'male'

        followees = tree.xpath('//div[@class = "zu-main-sidebar"]//strong/text()')
        if followees:
            self.user_followees = followees[0]
            self.user_followers = followees[1]

        stats = tree.xpath('//div[@class = "zm-profile-header-info-list"]//strong/text()')
        if stats:
            self.user_be_agreed = stats[0]
            self.user_be_thanked = stats[1]

        self.user_education_school = self.judge_data_have("school", tree.xpath('//span[@class = "education item"]/a/@title'))
        self.user_education_subject = self.judge_data_have("subject", tree.xpath('//span[@class = "education-extra item"]/a/@title'))
        self.user_employment = self.judge_data_have("employer", tree.xpath('//span[@class = "employment item"]/@title'))
        self.user_employment_extra = self.judge_data_have("position", tree.xpath('//span[@class = "position item"]/@title'))
        self.user_intro = self.judge_data_have("intro", tree.xpath('//div[@class = "bio ellipsis"]/@title'))

        # push every followee's profile URL onto the queue
        self.followees_urls = tree.xpath('//a[@class = "zg-link author-link"]/@href')
        for url in self.followees_urls:
            # url = url.replace("https", "http")
            self.queue.put(url)
        self.print_data_out()

    # print the collected profile
    def print_data_out(self):
        print("*" * 60)
        print("name: %s" % self.user_name)
        print("gender: %s" % self.user_gender)
        print("location: %s" % self.user_location)
        print("upvotes received: %s" % self.user_be_agreed)
        print("thanks received: %s" % self.user_be_thanked)
        print("followers: %s" % self.user_followers)
        print("following: %s" % self.user_followees)
        print("employment: %s/%s" % (self.user_employment, self.user_employment_extra))
        print("education: %s/%s" % (self.user_education_school, self.user_education_subject))
        print("intro: %s" % self.user_intro)
        print("*" * 60)
        self.save_in_mongodb()

    # store the record in MongoDB
    def save_in_mongodb(self):
        new_data = Zhihu_User_Data(
            user_name=self.user_name,
            user_gender=self.user_gender,
            user_location=self.user_location,
            user_followees=self.user_followees,
            user_followers=self.user_followers,
            user_be_agreed=self.user_be_agreed,
            user_be_thanked=self.user_be_thanked,
            user_education_school=self.user_education_school,
            user_education_subject=self.user_education_subject,
            user_employment=self.user_employment,
            user_employment_extra=self.user_employment_extra,
            user_intro=self.user_intro,
            followees_urls=self.followees_urls
        )
        new_data.save()

    # expose the URL queue
    def get_queue(self):
        return self.queue
def push_to_queue(queue_name, items):
    queue = RedisQueue(queue_name)
    for item in items:
        queue.put(item)
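# Most snippets in this section assume a simple put()/get()/get_nowait() FIFO
# wrapper named RedisQueue. Below is a minimal sketch of that shape with
# redis-py; the real redis_queue modules used above may differ in details such
# as key naming, which is why this sketch uses a distinct class name.
import redis

class SimpleRedisQueue(object):
    def __init__(self, name, namespace='queue', **redis_kwargs):
        self._db = redis.Redis(**redis_kwargs)
        self._key = '%s:%s' % (namespace, name)

    def put(self, item):
        self._db.lpush(self._key, item)

    def get(self, isBlocking=True, timeout=0):
        if isBlocking:
            popped = self._db.brpop(self._key, timeout=timeout)  # (key, value) or None
            return popped[1] if popped else None
        return self._db.rpop(self._key)

    def get_nowait(self):
        return self._db.rpop(self._key)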