예제 #1
0
    def run(self):
        """Send a notification e-mail every ``SMTP_SEND_INTERVAL`` seconds.

        Each cycle sleeps first, then builds a plain-text UTF-8 message from
        ``get_email_content()`` / ``get_email_header()`` and delivers it over
        a fresh SMTP connection. ``self.lastSendTime`` is updated after every
        successful delivery.

        Any exception ends the loop: it is logged and ``self.status`` is set
        to ``'error'``.
        """
        if log.isEnabledFor(logging.DEBUG):
            log.debug("邮件服务线程启动")
        try:
            while True:
                time.sleep(SMTP_SEND_INTERVAL)

                # Build the message to send.
                msg = MIMEText(self.get_email_content(), 'plain', 'utf-8')
                msg['from'] = SMTP_FROM_ADDR
                msg['to'] = SMTP_TO_ADDR
                msg['Subject'] = Header(self.get_email_header(),
                                        'utf-8').encode()

                # Deliver it. The context manager issues QUIT and closes the
                # socket even when login()/sendmail() raises, so a failed
                # attempt does not leak the connection.
                with smtplib.SMTP(SMTP_SERVER_HOST,
                                  SMTP_SERVER_PORT) as smtp_server:
                    smtp_server.login(SMTP_FROM_ADDR, SMTP_SERVER_PASSWORD)
                    smtp_server.sendmail(SMTP_FROM_ADDR, [SMTP_TO_ADDR],
                                         msg.as_string())

                # Record the time of the last successful delivery.
                self.lastSendTime = datetime.datetime.now()

        except Exception as e:
            if log.isEnabledFor(logging.ERROR):
                log.error("邮件发送失败")
                log.exception(e)
            self.status = 'error'
예제 #2
0
    def run(self):
        """Consume fetched follow-list pages and queue the tokens they yield.

        Loops forever: takes one item from the user-list cache queue and,
        when it carries both page HTML and a token, parses it with
        ``parse_user_list``. A non-``None`` result is passed to the user-token
        cache queue.

        Any exception ends the loop: it is logged and ``self.status`` is set
        to ``'error'``.
        """
        try:
            while True:
                data = self.cache_queue.get_data_from_user_list_cache_queue()

                # Every element of the queue item is optional.
                raw_data = (data[QUEUE_ELEM_HTML]
                            if QUEUE_ELEM_HTML in data else None)
                token = (data[QUEUE_ELEM_TOKEN]
                         if QUEUE_ELEM_TOKEN in data else None)
                thread_name = (data[QUEUE_ELEM_THREAD_NAME]
                               if QUEUE_ELEM_THREAD_NAME in data else None)

                if raw_data is None or token is None:
                    continue

                token_list = self.parse_user_list(raw_data, token)
                if token_list is None:
                    continue

                if log.isEnabledFor(logging.DEBUG):
                    # Logging arguments instead of "+" concatenation: a
                    # missing thread name (None) must not raise TypeError and
                    # take the whole parser thread down.
                    log.debug('[%s]开始分析用户“%s”的关注列表', thread_name, token)
                self.user_token_cache_queue.add_token_into_cache_queue(
                    token_list)
        except Exception as e:
            if log.isEnabledFor(logging.ERROR):
                log.exception(e)
            self.status = 'error'
예제 #3
0
 def run(self):
     """Thread entry point: run ``user_info_scrape`` and flag any failure.

     An exception escaping ``user_info_scrape`` is logged and recorded by
     setting ``self.status`` to ``'error'``.
     """
     debug_enabled = log.isEnabledFor(logging.DEBUG)
     if debug_enabled:
         start_message = '用户信息抓取线程' + self.thread_name + '启动'
         log.debug(start_message)

     try:
         self.user_info_scrape()
     except Exception as err:
         error_enabled = log.isEnabledFor(logging.ERROR)
         if error_enabled:
             log.exception(err)
         self.status = 'error'
예제 #4
0
    def validate_proxy_ip(self, proxy_ip_info):
        """Check whether a proxy works by requesting ``url`` through it.

        The proxy counts as valid when the last IPv4-looking string found in
        the response body equals the proxy's own IP. At most
        ``NETWORK_RECONNECT_TIMES`` attempts are made; a request error, a
        response without any IP, and a response reporting a different IP each
        consume one attempt.

        :param proxy_ip_info: mapping holding the ``proxyCore.PROXY_IP``,
            ``proxyCore.PROXY_PORT`` and ``proxyCore.PROXY_PROTOCOL`` entries,
            or ``None``.
        :return: ``True`` once the proxy is confirmed, ``False`` when
            ``proxy_ip_info`` is ``None`` or every attempt failed.
        """
        if proxy_ip_info is None:
            return False

        # Build the proxy mapping for the session.
        proxy_ip = proxy_ip_info[proxyCore.PROXY_IP]
        proxy_port = proxy_ip_info[proxyCore.PROXY_PORT]
        proxy_protocol = proxy_ip_info[proxyCore.PROXY_PROTOCOL].lower()
        proxy = {proxy_protocol: proxy_ip + ':' + proxy_port}

        # Connect through the proxy.
        self.session.headers = header
        self.session.proxies = proxy
        retry_time = 0
        while retry_time < NETWORK_RECONNECT_TIMES:
            try:
                response = self.session.get(url, timeout=CONNECT_TIMEOUT)
                # Collect the IP address(es) reported back by the server.
                match_list = re.findall(r'[0-9]+(?:\.[0-9]+){3}',
                                        response.text)
            except Exception:
                # A failed request is simply an unsuccessful attempt.
                match_list = []

            if match_list and match_list[-1] == proxy_ip:
                if log.isEnabledFor(logging.DEBUG):
                    log.debug("获取到一个可用的代理IP")
                return True

            # Count every unsuccessful attempt, including a response that
            # reports a different IP; otherwise that case would loop forever.
            retry_time += 1
        return False
예제 #5
0
    def run(self):
        """Consume fetched profile pages, store the users and queue them.

        Loops forever: takes one item from the user-info cache queue and,
        when it carries both page HTML and a token, parses it with
        ``parse_user_information``. A parsed user is marked in the Bloom
        filter, written through ``db_connection.add_user_info`` and its
        token/follow counts are pushed to the analysed-token cache queue.

        Any exception ends the loop: it is logged and ``self.status`` is set
        to ``'error'``.
        """
        try:
            while True:
                data = self.cache_queue.get_data_from_user_info_cache_queue()

                # Every element of the queue item is optional.
                raw_data = (data[QUEUE_ELEM_HTML]
                            if QUEUE_ELEM_HTML in data else None)
                token = (data[QUEUE_ELEM_TOKEN]
                         if QUEUE_ELEM_TOKEN in data else None)
                thread_name = (data[QUEUE_ELEM_THREAD_NAME]
                               if QUEUE_ELEM_THREAD_NAME in data else None)

                if raw_data is None or token is None:
                    continue

                user_info = self.parse_user_information(raw_data, token)
                if user_info is None:
                    continue

                if log.isEnabledFor(logging.DEBUG):
                    # Logging arguments instead of "+" concatenation: the
                    # thread name or the user's name may be None, which must
                    # not raise TypeError and stop the parser thread.
                    log.debug('[%s]搜索到一个用户:%s', thread_name,
                              user_info[USER_NAME])

                # Mark the user in the Bloom filter.
                self.bloom_filter.mark_value(user_info[USER_URL_TOKEN])
                self.db_connection.add_user_info(
                    self.convert_user_info(user_info))

                # Package the token data needed to analyse this user's
                # follow lists.
                token_info = {
                    USER_URL_TOKEN: user_info[USER_URL_TOKEN],
                    USER_FOLLOWING_COUNT: user_info[USER_FOLLOWING_COUNT],
                    USER_FOLLOWER_COUNT: user_info[USER_FOLLOWER_COUNT],
                }
                self.user_token_cache_queue.add_token_into_analysed_cache_queue(
                    [token_info])
        except Exception as e:
            if log.isEnabledFor(logging.ERROR):
                log.exception(e)
            self.status = 'error'
예제 #6
0
    def __init__(self):
        """Build the spider's modules and create its scrape threads.

        Creates the database connector, Bloom filter, user-token cache,
        page cache, fetch module and parse module, plus the e-mail service
        when ``IS_EMAIL_NOTIFICATION_ENABLE`` is ``True``. It then creates
        ``USER_INFO_SCRAPE_THREAD_NUM`` user-info threads and
        ``USER_LIST_SCRAPE_THREAD_NUM`` user-list threads, each kept in its
        own list. This method does not call ``start()`` on them.
        """
        # Load configuration parameters.
        self.config_init()

        # Database module.
        self.DBConnectModule = DBConnector.DBConnectModule()
        # Bloom filter module.
        self.bloomFilterModule = BloomFilter.BloomFilter()
        # User-token cache.
        self.userTokenCacheQueue = UserList.UserTokenCacheQueue(
            self.DBConnectModule)
        # Cache of fetched pages waiting to be parsed.
        self.cacheQueue = DataParser.CacheQueue()
        # Data fetch module.
        self.dataFetchModule = DataFetch.DataFetchModule(IS_PROXY_ENABLE)
        # Data parse module.
        self.dataParseModule = DataParser.DataParseModule(
            self.DBConnectModule, self.userTokenCacheQueue, self.cacheQueue,
            self.bloomFilterModule)
        # E-mail service module (optional).
        if IS_EMAIL_NOTIFICATION_ENABLE is True:
            self.emailService = EmailService.EmailService(self.DBConnectModule)

        # User-info scrape threads.
        self.user_info_scrape_thread_list = []
        for thread_count in range(USER_INFO_SCRAPE_THREAD_NUM):
            thread_name = 'Info-Thread' + str(thread_count)
            user_info_scrape_thread = UserInfoScrapeThread(
                thread_name, self.dataFetchModule, self.userTokenCacheQueue,
                self.cacheQueue, self.bloomFilterModule)
            self.user_info_scrape_thread_list.append(user_info_scrape_thread)

        # User-list scrape threads.
        self.user_list_scrape_thread_list = []
        for thread_count in range(USER_LIST_SCRAPE_THREAD_NUM):
            thread_name = 'List-Thread' + str(thread_count)
            user_list_scrape_thread = UserListScrapeThread(
                thread_name, self.DBConnectModule, self.dataFetchModule,
                self.userTokenCacheQueue, self.cacheQueue)
            # These belong in the *list* thread collection; putting them in
            # the info collection would leave this one empty.
            self.user_list_scrape_thread_list.append(user_list_scrape_thread)

        # Seed the token queue with the start token, if one is configured.
        if start_token != '':
            self.userTokenCacheQueue.add_token_into_cache_queue([start_token])

        if log.isEnabledFor(logging.DEBUG):
            log.debug("爬虫核心模块初始化完毕")
예제 #7
0
    def user_info_scrape(self):
        """Fetch profile pages for queued tokens and queue them for parsing.

        Runs forever. Each round picks a token from the user-token cache
        queue, fetches the matching user-info URL and then:

        * ignores a ``None`` response,
        * puts the token back into the queue when the response is ``'reuse'``,
        * otherwise enqueues the response text, token and thread name on the
          user-info cache queue.

        The round ends with a ``SCRAPE_TIME_INTERVAL`` sleep.
        """
        # Bind a session to this thread.
        self.data_fetch_module.thread_bind_session(self.thread_name)

        if log.isEnabledFor(logging.DEBUG):
            log.debug('用户信息爬取线程[' + self.thread_name + ']开始运行')

        while True:
            # Poll the un-analysed token queue; wait half a second whenever
            # it has nothing to offer.
            # NOTE(review): a token is accepted only when
            # is_token_available() returns False - confirm that helper's
            # meaning against its definition.
            while True:
                token = self.user_token_cache_queue.get_token_from_cache_queue()
                if token is None:
                    time.sleep(0.5)
                    continue
                if self.is_token_available(token) is False:
                    break

            # Fetch the profile page belonging to the token.
            profile_url = self.generate_user_info_url(token)
            response = self.data_fetch_module.fetch_data_of_url(
                profile_url, self.thread_name)

            if response is None:
                # Nothing usable came back; the token is dropped.
                pass
            elif response == 'reuse':
                # Put the token back so it is tried again later.
                self.user_token_cache_queue.add_token_into_cache_queue([token])
            else:
                # Hand the page over to the parser queue.
                page_item = {
                    DataParser.QUEUE_ELEM_HTML: response.text,
                    DataParser.QUEUE_ELEM_TOKEN: token,
                    DataParser.QUEUE_ELEM_THREAD_NAME: self.thread_name,
                }
                self.cache_queue.add_data_into_user_info_cache_queue(page_item)

            # Pause between requests.
            time.sleep(SCRAPE_TIME_INTERVAL)
예제 #8
0
    def __init__(self, db_connection):
        """Create both token queues and pre-fill them from the database.

        :param db_connection: object providing ``get_user_token_num()`` and
            ``get_analysed_token_num()``; it is stored on the instance.
        """
        self.db_connection = db_connection
        self.analysed_cache_queue = queue.Queue(MAX_ANALYSED_CACHE_QUEUE_SIZE)
        self.cache_queue = queue.Queue(MAX_CACHE_QUEUE_SIZE)

        # Pre-load tokens that have not been analysed yet.
        if self.db_connection.get_user_token_num() > 0:
            for pending_token in self.get_token_from_db(
                    REMAIN_CACHE_QUEUE_SIZE):
                self.cache_queue.put(pending_token)

        # Pre-load tokens that have already been analysed.
        if self.db_connection.get_analysed_token_num() > 0:
            for analysed_token in self.get_analysed_token_from_db(
                    REMAIN_ANALYSED_CACHE_QUEUE_SIZE):
                self.analysed_cache_queue.put(analysed_token)

        if log.isEnabledFor(logging.DEBUG):
            log.debug('用户 Token 缓存列表初始化完毕')
예제 #9
0
    def send_message(email_content):
        """Send one plain-text UTF-8 e-mail with a time-stamped subject.

        The subject is ``SMTP_EMAIL_HEADER`` followed by
        ``[month-day hour:minute:second]`` (numbers are not zero-padded).
        Delivery errors are logged and printed, never raised.

        :param email_content: body text of the message.
        """
        # Build the message to send.
        now = datetime.datetime.now()
        header = '{}[{}-{} {}:{}:{}]'.format(
            SMTP_EMAIL_HEADER, now.month, now.day,
            now.hour, now.minute, now.second)
        msg = MIMEText(email_content, 'plain', 'utf-8')
        msg['from'] = SMTP_FROM_ADDR
        msg['to'] = SMTP_TO_ADDR
        msg['Subject'] = Header(header, 'utf-8').encode()

        # Deliver it. The context manager issues QUIT and closes the socket
        # even when login()/sendmail() raises, so a failed attempt does not
        # leak the connection.
        try:
            with smtplib.SMTP(SMTP_SERVER_HOST,
                              SMTP_SERVER_PORT) as smtp_server:
                smtp_server.login(SMTP_FROM_ADDR, SMTP_SERVER_PASSWORD)
                smtp_server.sendmail(SMTP_FROM_ADDR, [SMTP_TO_ADDR],
                                     msg.as_string())
        except Exception as e:
            if log.isEnabledFor(logging.ERROR):
                log.error("邮件发送失败")
                log.exception(e)
            print(e)
예제 #10
0
 def start_user_info_data_parse_thread(self):
     """Start the user-info parsing thread, then log that it was started."""
     self.user_info_data_parse_thread.start()
     if not log.isEnabledFor(logging.DEBUG):
         return
     log.debug("用户信息数据解析线程启动")
예제 #11
0
    def parse_user_information(html_string, user_token):
        """Extract one user's profile fields from a fetched profile page.

        The page must contain a ``<div id="data">`` whose ``data-state``
        attribute holds JSON. The user record is read from
        ``json[JSON_ENTITIES][JSON_USERS][user_token]``.

        :param html_string: page HTML, or ``None``.
        :param user_token: key of the wanted user inside the users mapping.
        :return: dict keyed by the ``USER_*`` constants, where missing scalar
            fields are ``None`` and missing list fields are ``[]``; ``None``
            when the page holds no usable record for ``user_token``.
        """
        if html_string is None:
            return None

        def name_of(entry):
            """Return ``entry['name']``, or None if ``entry`` has no name."""
            if isinstance(entry, dict):
                return entry.get('name')
            return None

        # Locate the embedded JSON payload.
        bs_object = BeautifulSoup(html_string, 'html.parser')
        data_tag = bs_object.find('div', attrs={'id': 'data'})
        if data_tag is None:
            return None
        # A div without the attribute is treated like a missing div rather
        # than raising KeyError.
        data_string = data_tag.get('data-state')
        if data_string is None:
            return None

        # Decode HTML entities, then strip any embedded HTML tags.
        data_string = html.unescape(data_string)
        data_string = BeautifulSoup(data_string, 'html.parser').text
        try:
            # The payload may be malformed JSON.
            json_data = json.loads(data_string)
        except ValueError:
            if log.isEnabledFor(logging.DEBUG):
                log.debug('[error]解析到错误的 json 数据')
            return None

        # Walk entities -> users -> target user, giving up on any gap or
        # unexpected shape.
        user = json_data
        for key in (JSON_ENTITIES, JSON_USERS, user_token):
            if not isinstance(user, dict) or key not in user:
                return None
            user = user[key]
        if not isinstance(user, dict):
            return None

        # Nested collections: keep only entries that actually carry a name,
        # so null or incomplete entries do not raise.
        locations = []
        for location in user.get(USER_LOCATIONS) or []:
            location_name = name_of(location)
            if location_name is not None:
                locations.append(location_name)

        employments = []
        for employment in user.get(USER_EMPLOYMENTS) or []:
            elem = {}
            if isinstance(employment, dict):
                for part in ('job', 'company'):
                    part_name = name_of(employment.get(part))
                    if part_name is not None:
                        elem[part] = part_name
            employments.append(elem)

        educations = []
        for education in user.get(USER_EDUCATIONS) or []:
            if isinstance(education, dict):
                school = name_of(education.get('school'))
                if school is not None:
                    educations.append(school)

        # Assemble the user-info record.
        user_info = {
            USER_AVATAR_URL_TEMPLATE: user.get(USER_AVATAR_URL_TEMPLATE),
            USER_URL_TOKEN: user.get(USER_URL_TOKEN),
            USER_NAME: user.get(USER_NAME),
            USER_HEADLINE: user.get(USER_HEADLINE),
            USER_LOCATIONS: locations,
            USER_BUSINESS: name_of(user.get(USER_BUSINESS)),
            USER_EMPLOYMENTS: employments,
            USER_EDUCATIONS: educations,
            USER_DESCRIPTION: user.get(USER_DESCRIPTION),
            USER_SINAWEIBO_URL: user.get(USER_SINAWEIBO_URL),
            USER_GENDER: user.get(USER_GENDER),
            USER_FOLLOWING_COUNT: user.get(USER_FOLLOWING_COUNT),
            USER_FOLLOWER_COUNT: user.get(USER_FOLLOWER_COUNT),
            USER_ANSWER_COUNT: user.get(USER_ANSWER_COUNT),
            USER_QUESTION_COUNT: user.get(USER_QUESTION_COUNT),
            USER_VOTE_UP_COUNT: user.get(USER_VOTE_UP_COUNT),
        }
        return user_info
예제 #12
0
 def start_user_list_data_parse_thread(self):
     """Start the user-list parsing thread, then log that it was started."""
     self.user_list_data_parse_thread.start()
     if not log.isEnabledFor(logging.DEBUG):
         return
     log.debug('用户列表数据解析线程启动')
예제 #13
0
    def start_spider(self):
        """Start all worker threads, then supervise them forever.

        Start-up order: e-mail service (when enabled), the two parse threads,
        the user-info scrape threads, the user-list scrape threads, and
        finally a "spider started" e-mail (when enabled).

        Afterwards this method never returns. Every 180 seconds it restarts
        any service or thread whose status is ``'error'``.
        """
        # Start the periodic e-mail thread.
        if IS_EMAIL_NOTIFICATION_ENABLE is True:
            self.emailService.start_email_notification_service()

        # Start the data-parsing threads.
        self.dataParseModule.start_user_info_data_parse_thread()
        self.dataParseModule.start_user_list_data_parse_thread()

        # Start the user-info scrape threads.
        for user_info_scrape_thread in self.user_info_scrape_thread_list:
            user_info_scrape_thread.start()

        # Start the user-list scrape threads.
        for user_list_scrape_thread in self.user_list_scrape_thread_list:
            user_list_scrape_thread.start()

        if IS_EMAIL_NOTIFICATION_ENABLE is True:
            self.emailService.send_message("爬虫启动成功")

        # Watchdog loop: detect failed workers and restart them.
        while True:
            # E-mail service thread.
            if IS_EMAIL_NOTIFICATION_ENABLE is True:
                if self.emailService.get_email_notification_service_status(
                ) == 'error':
                    self.emailService.restart_email_notification_service()
                    if log.isEnabledFor(logging.ERROR):
                        log.error('邮件服务线程重新启动')

            # User-info parse thread.
            if self.dataParseModule.get_user_info_data_parse_thread_status(
            ) == 'error':
                self.dataParseModule.restart_user_info_data_parse_thread()
                if log.isEnabledFor(logging.ERROR):
                    log.error('用户信息解析线程重新启动')

            # User-list parse thread.
            if self.dataParseModule.get_user_list_data_parse_thread_status(
            ) == 'error':
                self.dataParseModule.restart_user_list_data_parse_thread()
                if log.isEnabledFor(logging.ERROR):
                    # Names the list parser, not the info parser.
                    log.error('用户列表解析线程重新启动')

            # User-info scrape threads. A failed thread is replaced in its
            # own slot; removing/appending while iterating the same list
            # would skip the element that follows a removed one.
            for index, thread in enumerate(self.user_info_scrape_thread_list):
                if thread.status != 'error':
                    continue
                thread_name = thread.thread_name
                # NOTE(review): arguments mirror the initial construction of
                # UserInfoScrapeThread in __init__ (fetch module, token
                # cache, page cache, Bloom filter) - confirm against the
                # thread class's constructor.
                new_thread = UserInfoScrapeThread(thread_name,
                                                  self.dataFetchModule,
                                                  self.userTokenCacheQueue,
                                                  self.cacheQueue,
                                                  self.bloomFilterModule)
                self.user_info_scrape_thread_list[index] = new_thread
                new_thread.start()
                if log.isEnabledFor(logging.ERROR):
                    log.error('用户信息爬取线程[' + thread_name + ']重新启动')

            # User-list scrape threads, replaced in place the same way.
            for index, thread in enumerate(self.user_list_scrape_thread_list):
                if thread.status != 'error':
                    continue
                thread_name = thread.thread_name
                new_thread = UserListScrapeThread(thread_name,
                                                  self.DBConnectModule,
                                                  self.dataFetchModule,
                                                  self.userTokenCacheQueue,
                                                  self.cacheQueue)
                self.user_list_scrape_thread_list[index] = new_thread
                new_thread.start()
                if log.isEnabledFor(logging.ERROR):
                    log.error('用户列表爬取线程[' + thread_name + ']重新启动')

            # Interval between health checks.
            time.sleep(180)
예제 #14
0
    def user_list_scrape(self):
        """Fetch the follow-list pages of analysed users, forever.

        Each round takes a token record from the analysed-token cache queue,
        polling every 0.5 s until one is available. It then scrapes that
        user's following list when ``ANALYSE_FOLLOWING_LIST`` is ``True`` and
        the follower list when ``ANALYSE_FOLLOWER_LIST`` is ``True``.
        """
        # Bind a session to this thread.
        self.data_fetch_module.thread_bind_session(self.thread_name)

        if log.isEnabledFor(logging.DEBUG):
            log.debug('用户列表爬取线程[' + self.thread_name + ']开始运行')

        while True:
            # Wait for a token record from the analysed-token queue.
            while True:
                token_info = self.user_token_cache_queue.get_token_form_analysed_cache_queue(
                )
                if token_info is not None:
                    break
                time.sleep(0.5)

            # Users this account is following.
            if ANALYSE_FOLLOWING_LIST is True:
                self._scrape_relation_pages(
                    token_info, DataParser.USER_FOLLOWING_COUNT,
                    FOLLOWING_PAGE_MAX, self.generate_following_list_url)

            # Users following this account.
            if ANALYSE_FOLLOWER_LIST is True:
                self._scrape_relation_pages(
                    token_info, DataParser.USER_FOLLOWER_COUNT,
                    FOLLOWER_PAGE_MAX, self.generate_follower_list_url)

    def _scrape_relation_pages(self, token_info, count_key, page_limit,
                               build_url):
        """Fetch every page of one follow list and queue it for parsing.

        :param token_info: token record; ``DataParser.USER_URL_TOKEN`` is
            read from it, and ``count_key`` when present.
        :param count_key: key of the list size used to compute the page
            count; one page is assumed when the key is absent.
        :param page_limit: upper bound on pages; values <= 0 mean no bound.
        :param build_url: callable ``(url_token, page) -> url``.
        """
        # Work out how many pages to visit.
        page_total = 1
        if count_key in token_info:
            page_total = self.calculate_max_page(token_info[count_key])
        if 0 < page_limit < page_total:
            page_total = page_limit

        cur_page = 1
        while cur_page <= page_total:
            response = self.data_fetch_module.fetch_data_of_url(
                build_url(token_info[DataParser.USER_URL_TOKEN], cur_page),
                self.thread_name)

            # A None response falls through to the sleep and the same page
            # is requested again.
            # NOTE(review): a page that keeps returning None is retried
            # without bound - confirm this is intended.
            if response is not None:
                if response == 'reuse':
                    # Retry the same page right away (no sleep).
                    continue
                # Hand the page over to the parser queue.
                self.cache_queue.add_data_into_user_list_cache_queue({
                    DataParser.QUEUE_ELEM_HTML: response.text,
                    DataParser.QUEUE_ELEM_TOKEN:
                    token_info[DataParser.USER_URL_TOKEN],
                    DataParser.QUEUE_ELEM_THREAD_NAME: self.thread_name,
                })
                cur_page += 1

            time.sleep(SCRAPE_TIME_INTERVAL)