Exemplo n.º 1
0
 def updateUserInfo(self, user_id, dict):
     """Write scraped profile fields for ``user_id`` into the ``user`` table.

     The row is also flagged with ``is_catch=1``.

     Args:
         user_id: value matched against ``user.user_id``.
         dict: mapping of profile fields. Keys read: ``name``, ``image``,
             ``sex``, ``sign``, ``location``, ``major``, ``job``,
             ``education``, ``info`` and ``url``. Missing text fields
             default to ``''``; a missing ``name`` or ``sex`` is ``None``
             and is therefore sent to the driver as ``None`` (SQL NULL)
             rather than the literal text ``'None'``.

     Returns:
         None. Nothing happens when ``dict`` is empty. Exceptions raised
         while talking to the database are logged via ``loge`` and not
         re-raised.
     """
     if not dict:
         return
     # Bound before the try so the finally clause can always test it,
     # even when openMySql() or conn.cursor() raises.
     cursor = None
     try:
         self.openMySql()
         # NOTE(review): filterString is kept from the original code; if it
         # only escaped quotes for the old string-built SQL it may now be
         # redundant with parameter binding -- confirm against its body.
         info = self.filterString(dict.get('info', ''))
         params = (
             dict.get('name'),
             dict.get('image', ""),
             dict.get('sex'),
             dict.get('sign', ''),
             dict.get('location', ''),
             dict.get('major', ''),
             dict.get('job', ''),
             dict.get('education', ''),
             info,
             dict.get('url', ''),
             user_id,
         )
         # Values are scraped from web pages, so they are bound as
         # parameters instead of being formatted into the statement.
         sql = ("update user u set u.name=%s,u.image=%s,u.sex=%s,u.sign=%s,"
                "u.location=%s,u.major=%s,u.job=%s,u.education=%s,"
                "u.info=%s,url_following=%s,u.is_catch=1 "
                "where u.user_id=%s")
         cursor = self.conn.cursor()
         cursor.execute(sql, params)
         self.conn.commit()
         log('更新用户信息到数据库成功,user_id=%s' % user_id)
     except Exception as e:
         loge(e)
     finally:
         if cursor is not None:
             cursor.close()
         self.closeMySql()
Exemplo n.º 2
0
 def saveAchieveInfo(self, user_id, achieveDict):
     """Insert one row of achievement figures for ``user_id`` into ``achieve``.

     Args:
         user_id: value stored in ``achieve.user_id``.
         achieveDict: mapping of achievement fields. Keys read:
             ``record_num``, ``record_by``, ``applaud_num``,
             ``gratitude_num``, ``collect_num``, ``public_edit_num``,
             ``is_excellent_answer`` and ``excellent_topic``. Missing
             counters default to ``0``, missing text to ``''``; the
             ``is_excellent_answer`` flag is stored as ``1`` or ``0``.

     Returns:
         None. Exceptions are logged via ``loge`` and not re-raised.
     """
     # Bound before the try so the finally clause can always test it,
     # even when openMySql() or conn.cursor() raises.
     cursor = None
     try:
         self.openMySql()
         params = (
             user_id,
             achieveDict.get('record_num', 0),
             achieveDict.get('record_by', ''),
             achieveDict.get('applaud_num', 0),
             achieveDict.get('gratitude_num', 0),
             achieveDict.get('collect_num', 0),
             achieveDict.get('public_edit_num', 0),
             1 if achieveDict.get('is_excellent_answer', False) else 0,
             achieveDict.get('excellent_topic', ''),
         )
         # Bind the values as parameters: they originate from scraped
         # pages and must not be spliced into the SQL text.
         sql = ("insert into achieve(user_id,record_num,record_by,"
                "applaud_num,gratitude_num,collect_num,public_edit_num,"
                "is_excellent_answer,excellent_topic) "
                "values(%s,%s,%s,%s,%s,%s,%s,%s,%s)")
         cursor = self.conn.cursor()
         cursor.execute(sql, params)
         self.conn.commit()
         log('保存用户个人成就到数据库成功,user_id = {0}'.format(user_id))
     except Exception as e:
         loge(e)
     finally:
         if cursor is not None:
             cursor.close()
         self.closeMySql()
Exemplo n.º 3
0
 def start(self):
     """Run the three worker threads and block until all of them finish.

     The threads run ``catchUserInfoThread``, ``catchUserFollowingThread``
     and ``exitThread``; they are started in that order and joined in the
     same order, after which a completion message is logged.
     """
     targets = (
         self.catchUserInfoThread,
         self.catchUserFollowingThread,
         self.exitThread,
     )
     workers = [threading.Thread(target=target) for target in targets]
     for worker in workers:
         worker.start()
     for worker in workers:
         worker.join()
     log("全部程序运行完毕")
Exemplo n.º 4
0
 def getFirstUserToFollowing(self):
     """Return the first user whose followings have not been crawled yet.

     Runs ``select * from user where is_following=0 order by id`` and
     takes the second column of the first row (the value is logged as
     ``user_id``).

     Returns:
         The second column of the first matching row, or ``None`` when no
         row matches or a database error occurs (errors are logged via
         ``loge``).
     """
     result = None
     # Bound before the try so the finally clause can always test it,
     # even when openMySql() or conn.cursor() raises.
     cursor = None
     try:
         self.openMySql()
         sql = 'select * from user where is_following=0 order by id'
         cursor = self.conn.cursor()
         cursor.execute(sql)
         res = cursor.fetchone()
         # fetchone() yields nothing when the queue is empty; that is a
         # normal outcome, not an error worth logging.
         if res:
             result = res[1]
     except Exception as e:
         loge(e)
     finally:
         if cursor is not None:
             cursor.close()
         self.closeMySql()
     log('获取第一个没有爬取关注者的用户, user_id={0}'.format(result))
     return result
Exemplo n.º 5
0
 def getFirstUserToCatch(self):
     """Return the first user whose profile has not been crawled yet.

     Runs ``select * from user where is_catch=0 order by id`` and takes
     the second column of the first row (the value is logged as
     ``user_id``).

     Returns:
         The second column of the first matching row, or ``None`` when no
         row matches or a database error occurs (errors are logged via
         ``loge``).
     """
     result = None
     # Bound before the try so the finally clause can always test it;
     # otherwise a failure in openMySql()/conn.cursor() would surface as
     # a NameError from the cleanup code and skip closeMySql().
     cursor = None
     try:
         self.openMySql()
         sql = "select * from user where is_catch=0 order by id"
         cursor = self.conn.cursor()
         cursor.execute(sql)
         res = cursor.fetchone()
         if res:
             result = res[1]
     except Exception as e:
         loge(e)
     finally:
         if cursor is not None:
             cursor.close()
         self.closeMySql()
     log('获取第一个没有被抓取的用户,user_id=%s' % result)
     return result
Exemplo n.º 6
0
 def start(self):
     """Launch all crawler threads and wait for every one of them to end.

     One thread runs ``catchUserFollowingThread``, one runs ``exitThread``
     and ``self.thread_num`` threads run ``catchUserInfoThread``; the
     latter all receive the same ``threading.Lock``. Threads are started
     and then joined in creation order, after which a completion message
     is logged.
     """
     # Fixed threads first, then the pool of profile crawlers.
     workers = [
         threading.Thread(target=self.catchUserFollowingThread),
         threading.Thread(target=self.exitThread),
     ]
     shared_lock = threading.Lock()
     for _ in range(self.thread_num):
         workers.append(
             threading.Thread(target=self.catchUserInfoThread,
                              args=(shared_lock, )))
     # Start everything before joining anything.
     for worker in workers:
         worker.start()
     for worker in workers:
         worker.join()
     log('所有程序运行完毕')
Exemplo n.º 7
0
 def getUserInfo(self, userId):
     """Load a user's profile page in PhantomJS and parse it.

     Up to three attempts are made; every attempt (successful or not) is
     followed by a sleep of ``self.time_duration`` seconds.

     Args:
         userId: identifier substituted into ``base_url``.

     Returns:
         A dict that always holds ``user_id``, ``url`` and ``code``.
         ``code`` is ``self.code_user_not_exist`` when the not-found text
         is present in the page, ``self.code_failure`` after three failed
         attempts, and ``self.code_success`` otherwise, in which case the
         results of ``parseUserInfo`` and ``parseAchieve`` are merged in.
     """
     # Build the profile-page URL for this user.
     url = base_url.format(userId)
     info = {'user_id': userId, 'url': url, 'code': self.code_success}
     for attempt in range(1, 4):
         driver = None
         try:
             driver = webdriver.PhantomJS(
                 executable_path=spider_const.phantomjs_path,
                 desired_capabilities=spider_const.desired_cap)
             driver.implicitly_wait(self.time_wait)
             driver.get(url)
             not_found_at = driver.page_source.find('你似乎来到了没有知识存在的荒原...')
             if not_found_at == -1:
                 # Press the expand button on the profile header before
                 # reading the page source.
                 expand_button = driver.find_element_by_class_name(
                     'ProfileHeader-expandButton')
                 expand_button.send_keys(Keys.ENTER)
                 # Parse the basic profile fields.
                 profile = self.parseUserInfo(driver.page_source)
                 # Parse the achievements side column.
                 log('开始抓取用户个人成就,user_id = {0}'.format(userId))
                 achievements = self.parseAchieve(driver.page_source)
                 info.update(profile)
                 info.update(achievements)
             else:
                 # The page shows the not-found text.
                 info['code'] = self.code_user_not_exist
             break
         except Exception as e:
             loge(e)
             log('发生异常,尝试第{0}次重试, user_id={1}'.format(attempt, userId))
         finally:
             if driver:
                 driver.quit()
             log('进入{0}秒休眠'.format(self.time_duration))
             time.sleep(self.time_duration)
             log('{0}秒休眠结束'.format(self.time_duration))
     else:
         # All three attempts raised: report the crawl as failed.
         info['code'] = self.code_failure
     return info
Exemplo n.º 8
0
 def saveFollowerInfo(self, user_id, follower_list):
     """Store follow relations for ``user_id`` and register the new users.

     For every entry of ``follower_list`` a ``follow(user_id, follower_id)``
     row is inserted, then a ``user(user_id)`` row is inserted for the
     same entry. Each insert is committed on its own, so one failing row
     does not prevent the others from being stored.

     Args:
         user_id: value stored in ``follow.user_id``.
         follower_list: iterable of ids stored in ``follow.follower_id``
             and ``user.user_id``.

     Returns:
         None. Nothing happens when either argument is empty. Failures of
         individual ``follow`` inserts are logged; failures of individual
         ``user`` inserts are ignored.
     """
     if not user_id or not follower_list:
         return
     self.openMySql()
     cursor = None
     try:
         cursor = self.conn.cursor()
         # Ids come from scraped pages, so bind them as parameters rather
         # than interpolating them into the statement text.
         follow_sql = "insert into follow(user_id,follower_id) values(%s,%s)"
         for item in follower_list:
             try:
                 cursor.execute(follow_sql, (user_id, item))
                 self.conn.commit()
             except Exception as e:
                 log('保存用户关注信息,插入follow表发生异常,user_id = {0}'.format(user_id))
                 loge(e)
         user_sql = "insert into user(user_id) values(%s)"
         for item in follower_list:
             try:
                 cursor.execute(user_sql, (item, ))
                 self.conn.commit()
             except Exception:
                 # Best-effort insert kept from the original code.
                 # NOTE(review): presumably this mostly hits rows that
                 # already exist in `user` -- confirm, and narrow the
                 # exception type to the driver's integrity error if so.
                 pass
     finally:
         # Always release the cursor and the connection, even when
         # something outside the per-row handlers raises.
         if cursor is not None:
             cursor.close()
         self.closeMySql()
Exemplo n.º 9
0
 def exitThread(self):
     """Poll for the exit-control file and raise the exit flag when found.

     While ``spider_const.control_exit_file`` does not exist, the thread
     sleeps ``spider_const.control_exit_duration * 60`` seconds between
     checks. Once the file exists, ``self.isExit`` is set to ``True`` and
     the method returns.
     """
     log('检测是否退出的线程启动')
     while True:
         exit_file = spider_const.control_exit_file
         if not os.path.exists(exit_file):
             pause = spider_const.control_exit_duration * 60
             log('未检测到退出文件,休眠{0}秒'.format(pause))
             time.sleep(pause)
             continue
         self.isExit = True
         log('检测到退出文件,退出程序.exit_file = {0}'.format(exit_file))
         return
Exemplo n.º 10
0
 def parseAchieve(self, content):
     """Extract achievement figures from the profile page side column.

     Every ``div.Profile-sideColumnItem`` card is classified by the text
     of its ``div.IconGraf`` title and contributes the matching keys.

     Args:
         content: page markup handed to ``pq``; ``None`` yields ``{}``.

     Returns:
         A dict holding any of ``is_excellent_answer``,
         ``excellent_topic``, ``record_num``, ``record_by``,
         ``applaud_num``, ``gratitude_num``, ``collect_num`` and
         ``public_edit_num``. Counts taken from the page are digit
         strings; ``gratitude_num`` and ``collect_num`` fall back to the
         integer ``0`` when their pattern is not found.
     """
     achieve = {}
     if content is None:
         return achieve
     # Compiled once, outside the loop; raw strings keep the backslash
     # escapes valid regular-expression syntax.
     non_digit = re.compile(r'\D')
     thanks_pattern = re.compile(r'获得\s?(\d+)\s?次感谢')
     collect_pattern = re.compile(r'(\d+)\s?次收藏')
     value_selector = 'div.Profile-sideColumnItemValue'
     for item in pq(content)('div.Profile-sideColumnItem').items():
         title = item('div.IconGraf').text()
         if title == '优秀回答者':
             topic = item(value_selector).text()
             achieve['is_excellent_answer'] = True
             achieve['excellent_topic'] = topic
             log('优秀回答者:topic=' + topic)
         elif title.startswith('知乎收录'):
             # The number of recorded answers is embedded in the title.
             record_num = non_digit.sub("", title)
             record_by = item(value_selector).text()
             achieve['record_num'] = record_num
             achieve['record_by'] = record_by
             log('知乎收录{0}个答案, {1}'.format(record_num, record_by))
         elif title.startswith('获得'):
             # The title carries the upvote count; thanks and collection
             # counts are read from the card's value text.
             applaud_num = non_digit.sub('', title)
             item_content = item(value_selector).text()
             thanks = thanks_pattern.search(item_content)
             gratitude_num = thanks.group(1) if thanks else 0
             collected = collect_pattern.search(item_content)
             collect_num = collected.group(1) if collected else 0
             achieve['applaud_num'] = applaud_num
             achieve['gratitude_num'] = gratitude_num
             achieve['collect_num'] = collect_num
             log('获得{0}次称赞,{1}次感谢,{2}次收藏'.format(applaud_num, gratitude_num,
                                                 collect_num))
         elif title.startswith('参与'):
             public_edit_num = non_digit.sub('', title)
             achieve['public_edit_num'] = public_edit_num
             log('参与{0}次公共编辑'.format(public_edit_num))
     return achieve
Exemplo n.º 11
0
 def catchUserFollowingThread(self):
     """Thread body: crawl the followings of queued users page by page.

     Loops until ``self.isExit`` becomes true. Each round takes a user
     and its last finished page from ``DBUtil.getFirstUserToFollowing2``,
     marks the user as being crawled, fetches the remaining pages via
     ``getUserFollowingPageContent`` and stores every non-empty page with
     ``DBUtil.saveFollowerInfo``. Progress is recorded after each page
     with ``setUserFollowingPage``; the user is marked ``catched`` only
     when all pages were processed without an exit request.
     """
     db = DBUtil()
     st = Status.Following()
     while not self.isExit:
         # Take the next user that still needs crawling.
         userId, currentPage = db.getFirstUserToFollowing2()
         if userId is None:
             # Nothing queued right now; poll again shortly.
             time.sleep(3)
             continue
         log('开始抓取用户关注者,user_id={0}, current_page={1}'.format(
             userId, currentPage))
         db.setUserIsFollowing(userId, st.is_catching)
         # Number of following pages for this user.
         total = self.getUserFollowingPageNum(userId)
         log('当前用户总的关注者的页数,user_id={0}, total_page={1}'.format(
             userId, total))
         # The user follows nobody.
         if total == 0:
             db.setUserIsFollowing(userId, st.user_following_none)
             continue
         # Stays True unless an exit request interrupts the page loop.
         isFinished = True
         for page in range(currentPage + 1, total + 1):
             if self.isExit:
                 isFinished = False
                 break
             followers = self.getUserFollowingPageContent(userId, page)
             if len(followers) > 0:
                 db.saveFollowerInfo(userId, followers)
                 db.setUserIsFollowing(userId, st.is_catching)
             # Record that this page is done.
             db.setUserFollowingPage(userId, page)
             log('抓取完一页用户的关注者,user_id={0}, page={1}, list.size={2}'.format(
                 userId, page, len(followers)))
             time.sleep(self.time_duration * 20)
         if isFinished:
             # Every page was processed: mark the user as done.
             db.setUserIsFollowing(userId, st.catched)
             log('当前用户关注的人全部抓取完毕,user_id= %s' % userId)
         else:
             log('当前用户关注的人没有抓取完毕,中途退出,user_id = {0}'.format(userId))
     log('获取用户关注者的线程运行结束')
Exemplo n.º 12
0
 def catchUserInfoThread(self, lock):
     """Thread body: crawl profile details of queued users until exit.

     Loops until ``self.isExit`` becomes true. Each round picks the next
     user and marks it as being crawled while holding ``lock``, then
     fetches the profile with ``ZhiHuSpider.getUserInfo`` and records the
     outcome through ``DBUtil`` according to the returned ``code``.

     Args:
         lock: lock shared by all threads running this method; it guards
             the "pick a user and mark it" step.
     """
     s = ZhiHuSpider()
     db = DBUtil()
     st = Status.Catch()
     while not self.isExit:
         # Pick and mark under the lock. The context manager releases the
         # lock even when a database call raises, so one failing worker
         # cannot leave the others blocked on acquire().
         with lock:
             userId = db.getFirstUserToCatch()
             if userId is not None:
                 db.setUserIsCatch(userId, st.is_catching)
         if userId is None:
             # Nothing queued; sleep outside the lock before polling again.
             time.sleep(5)
             continue
         log('开始爬取用户,pid={0}, user_id={1}'.format(os.getpid(), userId))
         info = s.getUserInfo(userId)
         code = info['code']
         if code == s.code_user_not_useful:
             log('用户没有价值,pid={0}, user_id={1}'.format(os.getpid(), userId))
             db.setUserIsCatch(userId, st.user_not_useful)
         elif code == s.code_user_not_exist:
             log('用户不存在,是僵尸粉,pid={0}, user_id={1}'.format(
                 os.getpid(), userId))
             db.setUserIsCatch(userId, st.user_not_exist)
         elif code == s.code_failure:
             log('用户抓取失败,pid={0}, user_id={1}'.format(os.getpid(), userId))
             db.setUserIsCatch(userId, st.failed)
         else:
             # Any other code is treated as success: persist the profile
             # and the achievements.
             log('用户抓取成功,pid={0}, user_id={1}'.format(os.getpid(), userId))
             db.updateUserInfo(userId, info)
             db.saveAchieveInfo(userId, info)
     log('获取用户详细信息的线程结束,tid = {0}'.format(self.getThreadId()))
Exemplo n.º 13
0
 def testProess(self, lock):
     """Log ``self.count`` and the current process id every three seconds.

     The loop never terminates. ``lock`` is accepted but not used.
     """
     while True:
         message = 'count={0},pid={1}'.format(self.count, os.getpid())
         log(message)
         time.sleep(3)