def loaddata(c_thread, thread_num, interval):
    """Worker loop: pop message URLs from redis, scrape them, store them in mongodb.

    Args:
        c_thread: controller object; the loop runs until its ``thread_stop``
            attribute becomes true.
        thread_num: thread name, used in log lines and to look up the
            ``ThreadMsg`` row whose status is reset when the worker ends.
        interval: not used by this worker; kept for the common signature.

    Returns:
        0 when the browser login failed, otherwise None.
    """
    log_name_title = "tencent_wb_msg_"
    ip = get_ip()
    base_date = time.strftime("%Y%m%d", time.localtime())
    log = log_setting(log_name_title + base_date + ".log")
    log.info("run......")
    driver = qq_login()
    time.sleep(3)
    if driver is None:
        log.info("phantomjs error!quit")
        return 0

    try:
        conn_redis = redis_connect()
        conn_mongo = connect_mongodb()
        # redis_connect() reports failure with None, connect_mongodb() with 0;
        # accept either sentinel for either connection.
        redis_failed = conn_redis is None or conn_redis == 0
        mongo_failed = conn_mongo is None or conn_mongo == 0
        if redis_failed or mongo_failed:
            log.info("redis or mongodb connect error")
        else:
            log.info("connect redis ok")
            log.info("connect mongodb ok")
            while not c_thread.thread_stop:
                # Switch to a new log file when the date changes.
                current_date = time.strftime("%Y%m%d", time.localtime())
                if current_date != base_date:
                    base_date = current_date
                    log = log_setting(log_name_title + base_date + ".log")
                log.info('Thread:(%s)' % (thread_num,))

                url = pop_redis_list(conn_redis)
                if url is None:
                    # Queue is empty: nothing left to do.
                    log.info("msg queue is NULL")
                    break

                # Fetch the message details, then persist them.
                msg = get_msg(driver, url, log)
                try:
                    load_mongodb(conn_mongo, url, msg)
                except Exception:
                    rtx('ip', ip + "机器mongodb失败")
                    log.info('ip' + ip + "机器mongodb失败")
                    log.info("mongodb error")
                    break
    finally:
        # Always release the browser and reset the thread status, even when
        # the loop above raised.
        log.info(thread_num + "quit phantomjs")
        driver.quit()
        # rtx notification
        rtx('ip', ip + "机器" + thread_num + "停止运行")
        log.info('ip' + ip + "机器" + thread_num + "停止运行")
        # Update the thread status row, looked up by thread name.
        log.info("更新数据库线程状态")
        thread = ThreadMsg.objects.get(thread_name=thread_num)
        thread.thread_status = 0
        thread.save()
def mysql_connect():
    """Open a MySQLdb connection to the ``db_tencent_wb`` database.

    Returns:
        The open connection, or None when MySQLdb reports an error (in that
        case a message is printed and ``rtx`` is called).
    """
    # NOTE(review): host and credentials are hard-coded; consider moving them
    # to configuration.
    try:
        mysql_conn = MySQLdb.connect("192.168.8.25", "qzone_spider",
                                     "qzone_spider", "db_tencent_wb")
    except MySQLdb.Error:
        # Only database errors are treated as "connection failed"; things like
        # KeyboardInterrupt are no longer swallowed.
        print("connect mysql error")
        rtx('IP', 'mysql连接异常')
        return None
    return mysql_conn
def mysql_connect_local_qq():
    """Open a MySQLdb connection to the local ``db_tencent_qzone`` database.

    Returns:
        The open connection, or None when MySQLdb reports an error (in that
        case a message is printed and ``rtx`` is called).
    """
    # NOTE(review): credentials are hard-coded; consider moving them to
    # configuration.
    try:
        mysql_conn = MySQLdb.connect("localhost", "qzone_spider",
                                     "qzone_spider", "db_tencent_qzone")
    except MySQLdb.Error:
        # Only database errors are treated as "connection failed"; things like
        # KeyboardInterrupt are no longer swallowed.
        print("connect mysql error")
        rtx('IP', 'mysql连接异常')
        return None
    return mysql_conn
def redis_connect():
    """Create a redis client for db 0 on 192.168.15.111 and verify it works.

    Returns:
        The redis client, or None when the server cannot be reached (in that
        case ``rtx`` is called and a message is printed).
    """
    # Connecting with a password would look like:
    #   redis.StrictRedis(host='localhost', port=6379, password='******')
    try:
        redis_conn = redis.Redis(host='192.168.15.111', port=6379, db=0)
        # The client constructor does not open a socket, so without a real
        # round-trip this try block could never detect an unreachable server.
        redis_conn.ping()
    except redis.RedisError:
        rtx('IP', 'redis连接异常')
        print("connect redis error")
        return None
    return redis_conn
def connect_mongodb():
    """Create a MongoClient for 192.168.15.111:27017 and verify it works.

    Returns:
        The client, or 0 when the server cannot be reached (callers compare
        the result against 0; ``rtx`` is called in that case).
    """
    # Current pymongo connection style (the legacy one was
    # pymongo.Connection("192.168.15.111", 27017)).
    try:
        conn = MongoClient("192.168.15.111", 27017)
        # Newer pymongo versions connect lazily, so the constructor alone does
        # not raise for an unreachable server; a cheap command forces the check.
        conn.admin.command('ping')
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate;
        # the pymongo error classes are not known to be in scope here.
        conn = 0
        rtx('IP', 'mongodb连接异常')
    return conn
def control_thread(request):
    """Start or stop a collector thread as requested via POST, then render the index page.

    Reads ``id`` (thread name) and ``control`` (``'start'`` or ``'stop'``) from
    ``request.POST``, drives ``ThreadControl`` accordingly, records the new
    status on the matching ``ThreadMsg`` row and returns the rendered
    ``index.html`` listing the threads registered for this machine's IP.
    """
    post_data = request.POST
    th_name = post_data['id']
    control = post_data['control']
    print("thread_name is  %s" % (th_name,))
    # Flag passed to the template to show the active state.
    msg_active = True
    thread = ThreadMsg.objects.get(thread_name=th_name)

    if control == 'start':
        rtx('ip', '进程' + str(th_name) + ' 开始采集标签信息')
        controller = ThreadControl()
        # An error from the liveness check means the thread does not exist,
        # in which case it is started.
        try:
            alive = controller.is_alive(th_name)
            print("thread is alive?  %s" % (alive,))
            if not alive:
                print("start ..........thread1")
                controller.start(th_name, 1)
            else:
                print("thread is alive,caonot start twice!")
        except:
            print("thread is not alive start!!!")
            controller.start(th_name, 1)
        thread.thread_status = 1
        thread.save()
    elif control == 'stop':
        rtx('ip', '进程' + str(th_name) + ' 采集标签信息即将停止')
        controller = ThreadControl()
        try:
            controller.stop(th_name)
            thread.thread_status = 0
            thread.save()
        except:
            print("not thread alive")

    IP = get_ip()
    context = {
        "thread_name": th_name,
        "control": control,
        "thread_list": ThreadMsg.objects.filter(thread_ip=IP),
        "msg_active": msg_active,
    }
    return render_to_response('index.html', context)
def loaddata(c_thread, thread_num, interval):
    """Worker loop: pop QQ numbers from redis, scrape each Qzone profile, store it in mongodb.

    QQ numbers whose profile could not be fetched (``get_info`` returned 0 or 1)
    are pushed to the ``tencent_qzone_forbid_qq`` redis list instead.

    Args:
        c_thread: controller object; the loop runs until its ``thread_stop``
            attribute becomes true.
        thread_num: thread name, used in the log file name, log lines and to
            look up the ``ThreadQzoneInfo`` row reset when the worker ends.
        interval: not used by this worker; kept for the common signature.

    Returns:
        0 when the initial browser login failed, otherwise None.
    """
    log_name_title = str(thread_num) + "_tencent_qzone_info_"
    ip = get_ip()
    base_date = time.strftime("%Y%m%d", time.localtime())
    log = log_setting(log_name_title + base_date + ".log")
    log.info(thread_num + "run......")
    driver = qzone_login()
    time.sleep(3)
    if driver is None:
        log.info("phantomjs error!quit")
        return 0

    # Names of the redis lists to pop from / push forbidden QQs to.
    redis_list_pop_name = "tencent_qzone_qq_info"
    redis_list_push_qzone_forbid_name = "tencent_qzone_forbid_qq"

    try:
        conn_redis = redis_connect()
        conn_mongo = connect_mongodb()
        # redis_connect() reports failure with None, connect_mongodb() with 0;
        # accept either sentinel for either connection.
        redis_failed = conn_redis is None or conn_redis == 0
        mongo_failed = conn_mongo is None or conn_mongo == 0
        if redis_failed or mongo_failed:
            log.info("redis or mongodb connect error")
        else:
            log.info("connect redis ok")
            log.info("connect mongodb ok")
            while not c_thread.thread_stop:
                # Switch to a new log file when the date changes.
                current_date = time.strftime("%Y%m%d", time.localtime())
                if current_date != base_date:
                    base_date = current_date
                    log = log_setting(log_name_title + base_date + ".log")
                print('Thread:(%s) Time:%s\n' % (thread_num, time.ctime()))

                qq = pop_redis_list(conn_redis, redis_list_pop_name)
                log.info('Thread:(%s) QQ:%s' % (thread_num, qq))
                if qq is None:
                    # Queue is empty: nothing left to do.
                    log.info("queue is NULL")
                    break

                url = "http://user.qzone.qq.com/" + str(qq) + "/profile"
                info_list = get_info(driver, url, log)
                if info_list == 0:
                    # Request failed: park the QQ and log in again.
                    push_redis_list_tmp(conn_redis,
                                        redis_list_push_qzone_forbid_name, qq)
                    log.info(qq + "请求失败,入队禁止访问消息队列")
                    # Release the old browser first so PhantomJS processes do
                    # not pile up with every re-login.
                    try:
                        driver.quit()
                    except Exception:
                        log.info("quit phantomjs error")
                    driver = qzone_login()
                    if driver is None:
                        # Without a browser the loop cannot continue.
                        log.info("phantomjs error!quit")
                        break
                elif info_list == 1:
                    # Access denied: park the QQ in the forbidden list.
                    push_redis_list_tmp(conn_redis,
                                        redis_list_push_qzone_forbid_name, qq)
                    log.info(qq + "入队禁止访问消息队列")
                else:
                    log.info("load to mongodb")
                    try:
                        load_mongodb_qzone_info(conn_mongo, qq, info_list)
                    except Exception:
                        rtx('ip', ip + "机器mongodb失败")
                        log.info('ip' + ip + "机器mongodb失败")
                        log.info("mongodb error")
                        break
    finally:
        # Always release the browser and reset the thread status, even when
        # the loop above raised.
        log.info(thread_num + "quit phantomjs")
        if driver is not None:
            driver.quit()
        # rtx notification
        rtx('ip', ip + "机器" + thread_num + "停止运行")
        log.info('ip' + ip + "机器" + thread_num + "停止运行")
        # Update the thread status row, looked up by thread name.
        log.info("更新数据库线程状态")
        thread = ThreadQzoneInfo.objects.get(thread_name=thread_num)
        thread.thread_status = 0
        thread.save()
def loaddata(c_thread, thread_num, interval):
    """Worker loop (test queues): collect the commenters found on each QQ's Qzone mood page.

    For every QQ popped from ``tencent_qzone_qq_test`` the mood page is opened,
    the QQ numbers found in ``comments_content`` links are collected, the
    (qq, friend) pairs are written to MySQL, and the QQ plus its friends are
    pushed to ``tencent_qzone_qq_tmp_test``.

    Args:
        c_thread: controller object; the loop runs until its ``thread_stop``
            attribute becomes true.
        thread_num: thread name, used in output and to look up the
            ``Thread_qq_friend`` row reset when the worker ends.
        interval: not used by this worker; kept for the common signature.

    Returns:
        0 when the browser login failed, otherwise None.
    """
    print("run......")
    driver = qzone_login()
    time.sleep(3)
    if driver is None:
        # The original had this message as a bare string expression, so it
        # was never shown.
        print("phantomjs error!quit")
        return 0

    redis_list_name_pop = "tencent_qzone_qq_test"
    redis_list_name_push = "tencent_qzone_qq_tmp_test"

    try:
        conn_redis = redis_connect()
        print("conn_redis %s" % (conn_redis,))
        if conn_redis is None:
            print("redis connect error")
        else:
            while not c_thread.thread_stop:
                print('qzone_qq_friend Thread:(%s) Time:%s\n'
                      % (thread_num, time.ctime()))
                qq = pop_redis_list(conn_redis, redis_list_name_pop)
                if qq is None:
                    print("queue is NULL")
                    break

                url = "http://user.qzone.qq.com/" + qq + "/mood"
                print("url %s" % (url,))
                driver.get(url)
                try:
                    # Wait for the page frame to be present.
                    WebDriverWait(driver, 3).until(
                        EC.presence_of_element_located(
                            (By.ID, "app_canvas_frame")))
                    print("find frame id")
                    driver.switch_to.frame('app_canvas_frame')
                    try:
                        # Wait for the comments inside the frame.
                        WebDriverWait(driver, 10).until(
                            EC.presence_of_element_located(
                                (By.CLASS_NAME, "comments_content")))
                        print("find conment")
                        soup = BeautifulSoup(driver.page_source)
                        print("======")
                        my_set = set()
                        for node in soup.find_all(class_='comments_content'):
                            # presumably the href has a fixed 25-char prefix and
                            # 6-char suffix around the QQ number — TODO confirm
                            friend_qq = str(node.find('a')['href'])[25:-6]
                            print(friend_qq)
                            if friend_qq != qq:
                                my_set.add(friend_qq)
                        print(my_set)
                        friend_qq_list = list(my_set)
                        print(friend_qq_list)
                    except Exception:
                        # Sentinel '0': frame found but no comments.
                        print("not found conment")
                        friend_qq_list = ['0']
                except Exception:
                    # Sentinel '-1': the frame never appeared (no permission).
                    print("没有权限访问")
                    friend_qq_list = ['-1']
                print(friend_qq_list)

                # ---- store in MySQL ----
                print("insert mysql")
                # Tuples pairing qq with each friend_qq.
                tmp_tuple = get_tuple(qq, friend_qq_list)
                print("insert into table ")
                mysql_conn = mysql_connect_local_qq()
                if mysql_conn is None:
                    # The connect helper already reported the failure; skip
                    # the insert instead of crashing on None.
                    print("insert mysql error")
                else:
                    try:
                        insert_mysql_qq(mysql_conn, tmp_tuple)
                    finally:
                        # Close the connection even when the insert fails.
                        mysql_conn.close()

                # ---- push to the temporary redis list ----
                print("put mid redis")
                push_redis_list_tmp(conn_redis, redis_list_name_push, qq)
                print("put auditor mid redis")
                # NOTE(review): the '0' / '-1' sentinels are pushed as well;
                # confirm the consumer of this list expects them.
                for friend_qq in friend_qq_list:
                    push_redis_list_tmp(conn_redis, redis_list_name_push,
                                        friend_qq)
    finally:
        # Always release the browser and reset the thread status, even when
        # the loop above raised.
        print("%s quit phantomjs" % (thread_num,))
        driver.quit()
        # rtx notification
        ip = get_ip()
        rtx('ip', ip + "机器" + thread_num + "停止运行")
        # Update the thread status row, looked up by thread name.
        print("更新数据库线程状态")
        thread = Thread_qq_friend.objects.get(thread_name=thread_num)
        thread.thread_status = 0
        thread.save()
def loaddata(c_thread, thread_num, interval):
    """Worker loop: for each weibo user id popped from redis, collect the ids it follows.

    The (mid, followed mid) pairs are written to MySQL and both the mid and
    the followed mids are pushed back to redis via ``push_redis_list_tmp``.

    Args:
        c_thread: controller object; the loop runs until its ``thread_stop``
            attribute becomes true.
        thread_num: thread name, used in log lines and to look up the
            ``Threadauditor`` row reset when the worker ends.
        interval: not used by this worker; kept for the common signature.

    Returns:
        0 when the initial browser login failed, otherwise None.
    """
    log_name_title = "tencent_wb_auditor_"
    base_date = time.strftime("%Y%m%d", time.localtime())
    log = log_setting(log_name_title + base_date + ".log")
    log.info(thread_num + "run......")
    driver = qq_login()
    time.sleep(3)
    if driver is None:
        log.info("phantomjs error!quit")
        return 0

    # Resolved up front: the shutdown notification needs it even when the
    # redis connection fails.
    ip = get_ip()
    try:
        conn_redis = redis_connect()
        if conn_redis is None:
            log.info("redis connect error")
        else:
            log.info("connect redis ok")
            while not c_thread.thread_stop:
                # Switch to a new log file when the date changes.
                current_date = time.strftime("%Y%m%d", time.localtime())
                if current_date != base_date:
                    base_date = current_date
                    log = log_setting(log_name_title + base_date + ".log")
                log.info('Thread:(%s)' % (thread_num,))

                mid = pop_redis_list(conn_redis)
                if mid is None:
                    log.info("queue is NULL")
                    break

                url = "http://t.qq.com/" + str(mid)
                log.info("url is: " + url)
                time.sleep(3)
                # All "following" pages reachable from the user's home page.
                auditor_page_url_list = get_auditor_page_url_via_url(driver, url)
                if auditor_page_url_list is None:
                    log.info("page is not personal,login again")
                    driver.quit()
                    driver = qq_login()
                    if driver is None:
                        break
                    continue

                # Home-page ids of everyone listed on those pages.
                mid_list = get_auditor_main_url(driver, auditor_page_url_list)
                if mid_list is None:
                    continue

                # ---- store in MySQL ----
                try:
                    log.info("insert mysql")
                    # Tuples pairing mid with each followed mid.
                    tmp_tuple = get_tuple(mid, mid_list)
                    print("insert into table ")
                    mysql_conn = mysql_connect()
                    try:
                        insert_mysql(mysql_conn, tmp_tuple)
                    finally:
                        # Close the connection even when the insert fails.
                        if mysql_conn is not None:
                            mysql_conn.close()
                except Exception:
                    rtx('ip', ip + "机器mysql出错")
                    log.info('ip' + ip + "机器mysql出错")
                    log.info("insert mysql error")

                # ---- push to the temporary redis list ----
                try:
                    log.info("put mid redis")
                    push_redis_list_tmp(conn_redis, mid)
                    log.info("put auditor mid redis")
                    for auditor_mid in mid_list:
                        push_redis_list_tmp(conn_redis, auditor_mid)
                except Exception:
                    rtx('ip', ip + "机器redis出错")
                    log.info('ip' + ip + "机器redis出错")
                    log.info("insert redis error")
    finally:
        # Always release the browser and reset the thread status, even when
        # the loop above raised.
        log.info(thread_num + "quit phantomjs")
        # driver is None when a re-login inside the loop failed.
        if driver is not None:
            driver.quit()
        # rtx notification
        rtx('ip', ip + "机器" + thread_num + "停止运行")
        log.info('ip' + ip + "机器" + thread_num + "停止运行")
        # Update the thread status row, looked up by thread name.
        log.info("更新数据库线程状态")
        thread = Threadauditor.objects.get(thread_name=thread_num)
        thread.thread_status = 0
        thread.save()
def qq_login(max_attempts=10):
    """Log a randomly chosen account into t.qq.com with PhantomJS.

    Each attempt picks a random ``TencentUser``, starts PhantomJS, fills in
    the login form and checks that the browser ended up on the account's
    home page. A failed attempt quits its browser and the next attempt starts
    from scratch with a freshly chosen account.

    Args:
        max_attempts: how many login attempts to make before giving up.

    Returns:
        The logged-in webdriver, or None when there is no account, PhantomJS
        cannot be started, or every login attempt failed.
    """
    phantomjs_path = '/usr/local/phantomjs-2.1.1-linux-x86_64/bin/phantomjs'

    for _attempt in range(max_attempts):
        user_count = TencentUser.objects.count()
        proxy_count = TencentProxy.objects.count()
        print("USER_COUNT %s" % (user_count,))
        print("PROXY_COUNT %s" % (proxy_count,))
        if user_count == 0:
            # random.randint(1, 0) would raise; there is nothing to log in with.
            print("no login account available")
            return None

        # NOTE(review): picking by random id assumes ids run 1..count without
        # gaps — confirm against the tables.
        user_number = random.randint(1, user_count)
        if proxy_count == 0:
            proxy_status = False
        else:
            proxy_number = random.randint(1, proxy_count)
            print("proxy_number %s" % (proxy_number,))
            proxy_object = TencentProxy.objects.get(proxy_id=proxy_number)
            proxy = '--proxy=' + proxy_object.proxy_ip
            proxy_status = True
            # The proxy is only reported: the PhantomJS call below does not
            # receive it (only a commented-out Windows variant passed it on).
            print("proxy %s" % (proxy,))
        print("user_number is %s" % (user_number,))
        print("proxy_status %s" % (proxy_status,))

        # Fetch the randomly chosen login account from the database.
        user = TencentUser.objects.get(user_id=user_number)
        login_name = user.login_name
        login_pwd = user.login_password
        tencent_wb_name = user.tencent_wb_name

        # Start PhantomJS, retrying a few times before giving up.
        driver = None
        for _start_try in range(6):
            try:
                driver = webdriver.PhantomJS(executable_path=phantomjs_path)
                break
            except Exception:
                print("PhantomJS error,wait a moment!")
                time.sleep(2)
        if driver is None:
            rtx('IP', '连接phantomjs失败,检查phantomjs是否可用')
            return None

        try:
            print("start get main")
            driver.get("http://t.qq.com/")
            print("get over")
            time.sleep(3)
            driver.switch_to.frame("login_div")
            driver.find_element_by_id("switcher_plogin").click()
            driver.find_element_by_id("u").send_keys(login_name)
            driver.find_element_by_id("p").send_keys(login_pwd)
            driver.find_element_by_id("login_button").click()
            time.sleep(10)
            print("driver.current_url is %s" % (driver.current_url,))
            # Login succeeded when we landed on the account's home page.
            if driver.current_url == str("http://t.qq.com/" + tencent_wb_name):
                return driver
            print("url not match!")
        except Exception:
            print("login error!")
            rtx('IP', '登陆异常,检查帐密或者代理是否可用')

        # This attempt failed: release its browser before trying again. The
        # original retried recursively but dropped the result and returned
        # the browser it had just quit.
        driver.quit()

    print("login failed, give up")
    return None
def qzone_login():
    """Log a randomly chosen account into Qzone (i.qq.com) with PhantomJS.

    One ``TencentUser`` is picked at random and up to 10 login attempts are
    made with it; each attempt uses a fresh PhantomJS instance and succeeds
    when the browser ends up on the account's Qzone home page (http or https).

    Returns:
        The logged-in webdriver, or None when there is no account, PhantomJS
        cannot be started, or every login attempt failed.
    """
    user_count = TencentUser.objects.count()
    if user_count == 0:
        # random.randint(1, 0) would raise; there is nothing to log in with.
        print("no login account available")
        return None

    # Fetch a random login account from the database.
    # NOTE(review): assumes user ids run 1..count without gaps — confirm.
    user = TencentUser.objects.get(user_id=random.randint(1, user_count))
    login_name = user.login_name
    # The stored password is base64-encoded.
    login_pwd = base64.decodestring(user.login_password)
    home_suffix = str(user.qq_qzone_name)
    expected_urls = ("http://user.qzone.qq.com/" + home_suffix,
                     "https://user.qzone.qq.com/" + home_suffix)

    for _attempt in range(10):
        # Start PhantomJS, retrying a few times before giving up.
        driver = None
        for _start_try in range(4):
            try:
                driver = webdriver.PhantomJS(
                    executable_path='/usr/local/phantomjs/bin/phantomjs',
                    service_log_path='/data/tmp/ghostdriver.log')
                break
            except Exception:
                print("PhantomJS error,wait a moment!")
                time.sleep(10)
        if driver is None:
            # Previously the code went on to use an unbound (or already quit)
            # driver here; report the failure instead.
            rtx('ip', 'qq login error')
            return None

        try:
            driver.get("http://i.qq.com/")
            time.sleep(3)
            driver.switch_to.frame("login_frame")
            driver.find_element_by_id("switcher_plogin").click()
            driver.find_element_by_id("u").send_keys(login_name)
            driver.find_element_by_id("p").send_keys(login_pwd)
            driver.find_element_by_id("login_button").click()
            time.sleep(10)
            print("driver.current_url is %s" % (driver.current_url,))
            print("match is :  %s" % (expected_urls[0],))
            if driver.current_url in expected_urls:
                # Return immediately so a success on the last attempt is not
                # thrown away by the attempt counter.
                return driver
            print("url 不一致!")
        except Exception:
            print("login error!")

        # This attempt failed: release its browser before trying again.
        try:
            driver.quit()
        except Exception:
            print("quit phantomjs error")

    rtx('ip', 'qq login error')
    return None
def loaddata(c_thread, thread_num, interval):
    """Worker loop: collect the commenters found on each QQ's Qzone mood page.

    For every QQ popped from ``tencent_qzone_qq`` the mood page is opened, the
    QQ numbers found in ``comments_content`` links are collected, the
    (qq, friend) pairs are written to MySQL, and the QQ plus its friends are
    pushed to ``tencent_qzone_qq_transfer``.

    Args:
        c_thread: controller object; the loop runs until its ``thread_stop``
            attribute becomes true.
        thread_num: thread name, used in log lines and to look up the
            ``Thread_qq_friend`` row reset when the worker ends.
        interval: not used by this worker; kept for the common signature.

    Returns:
        0 when the browser login failed, otherwise None.
    """
    # NOTE(review): the log prefix says "wb_auditor" although this worker
    # scrapes Qzone — confirm whether that is intended.
    log_name_title = "tencent_wb_auditor_"
    base_date = time.strftime("%Y%m%d", time.localtime())
    log = log_setting(log_name_title + base_date + ".log")
    log.info(thread_num + "run......")
    driver = qzone_login()
    time.sleep(3)
    if driver is None:
        log.info("phantomjs error!quit")
        return 0

    redis_list_name_pop = "tencent_qzone_qq"
    redis_list_name_push = "tencent_qzone_qq_transfer"
    # Resolved up front: the shutdown notification needs it even when the
    # redis connection fails.
    ip = get_ip()

    try:
        conn_redis = redis_connect()
        print("conn_redis %s" % (conn_redis,))
        if conn_redis is None:
            log.info("redis connect error")
        else:
            log.info("redis connect ok")
            while not c_thread.thread_stop:
                log.info('qzone_qq_friend Thread:(%s) Time:%s'
                         % (thread_num, time.ctime()))
                qq = pop_redis_list(conn_redis, redis_list_name_pop)
                if qq is None:
                    log.info("queue is NULL")
                    break

                url = "http://user.qzone.qq.com/" + qq + "/mood"
                log.info("url" + url)
                driver.get(url)
                try:
                    # Wait for the page frame to be present.
                    WebDriverWait(driver, 3).until(
                        EC.presence_of_element_located(
                            (By.ID, "app_canvas_frame")))
                    log.info("find frame id")
                    driver.switch_to.frame('app_canvas_frame')
                    try:
                        # Wait for the comments inside the frame.
                        WebDriverWait(driver, 10).until(
                            EC.presence_of_element_located(
                                (By.CLASS_NAME, "comments_content")))
                        log.info("find conment")
                        soup = BeautifulSoup(driver.page_source)
                        print("======")
                        my_set = set()
                        for node in soup.find_all(class_='comments_content'):
                            # presumably the href has a fixed 25-char prefix and
                            # 6-char suffix around the QQ number — TODO confirm
                            friend_qq = str(node.find('a')['href'])[25:-6]
                            print(friend_qq)
                            if friend_qq != qq:
                                my_set.add(friend_qq)
                        print(my_set)
                        friend_qq_list = list(my_set)
                        print(friend_qq_list)
                    except Exception:
                        # Sentinel '0': frame found but no comments.
                        log.info("not found conment")
                        friend_qq_list = ['0']
                except Exception:
                    # Sentinel '-1': the frame never appeared (no permission).
                    log.info("没有权限访问")
                    friend_qq_list = ['-1']
                print(friend_qq_list)

                # ---- store in MySQL ----
                try:
                    log.info("insert mysql")
                    # Tuples pairing qq with each friend_qq.
                    tmp_tuple = get_tuple(qq, friend_qq_list)
                    print("insert into table ")
                    mysql_conn = mysql_connect_qq()
                    try:
                        insert_mysql_qq(mysql_conn, tmp_tuple)
                    finally:
                        # Close the connection even when the insert fails.
                        if mysql_conn is not None:
                            mysql_conn.close()
                except Exception:
                    rtx('ip', ip + "机器QQ空间关系链采集mysql出错")
                    log.info('ip' + ip + "机器QQ空间关系链采集mysql出错")

                # ---- push to the temporary redis list ----
                try:
                    log.info("put mid redis")
                    push_redis_list_tmp(conn_redis, redis_list_name_push, qq)
                    log.info("put auditor mid redis")
                    # NOTE(review): the '0' / '-1' sentinels are pushed as
                    # well; confirm the consumer of this list expects them.
                    for friend_qq in friend_qq_list:
                        push_redis_list_tmp(conn_redis, redis_list_name_push,
                                            friend_qq)
                except Exception:
                    rtx('ip', ip + "机器QQ空间关系链采集redis入队出错")
                    log.info('ip' + ip + "机器QQ空间关系链采集redis入队出错")
    finally:
        # Always release the browser and reset the thread status, even when
        # the loop above raised.
        log.info(thread_num + "quit phantomjs")
        driver.quit()
        # rtx notification
        rtx('ip', ip + "机器" + thread_num + "停止运行")
        log.info('ip' + ip + "机器" + thread_num + "停止运行")
        # Update the thread status row, looked up by thread name.
        log.info("更新数据库线程状态")
        thread = Thread_qq_friend.objects.get(thread_name=thread_num)
        thread.thread_status = 0
        thread.save()