    def run_downloader(self, pipeDictData):
        """
        Download driver.
        :param pipeDictData: data source (a pipe connection)
        """
        # Use a single IP to download from the same site
        ipProxy = None
        while True:
            # Log.i('run_downloader in {0}'.format(time.ctime()))
            # Receive data from the pipe
            DictData = pipeDictData.recv()
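            # For reference, a DictData payload has the dict shape shown in the commented
            # example inside downLoadHtml below: keys such as 'Urlname', 'DeepNum', 'domain',
            # 'title', 'fatherUrl', 'Download', 'content' and 'soup'.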

            # Create the database connection lazily, once data has arrived
            if self.mogodbControl is None:
                self.mogodbControl = Mongodb_Operator(Dbdata["host"], Dbdata["port"], Dbdata["db_name"], Dbdata["default_collection"])
            # Likewise, only fetch a proxy IP once data has arrived
            if DictData is not None:
                # Get a free proxy IP
                if ipProxy is None:
                    ipProxy = self.getIpPoolMethod()
                # Run the download asynchronously
                self.ayncDownloadTask(ipProxy, DictData)
class simpleProducerChildUrl():
    def __init__(self, processName, crontab):
        self.processName = processName
        self.crontab = crontab
        # Initialize the proxy IP and request headers
        self.ipProxy = self.getIpPoolMethod()
        self.headersEngine = HeadersEngine()
        self.heads = self.headersEngine.getHeaders()
        # Database model and controller
        self.URL_inf = URLinformation()
        self.__Sendcollection = "httpsearchccgpgovcn"
        self.mogodbControl = None

    def downLoadHtml(self, data):
        """
        爬取并提取子链接
        :param urlInfor:
        """
        if self.ipProxy is None:
            self.ipProxy = self.getIpPoolMethod()
        if self.heads is None:
            self.heads = self.headersEngine.getHeaders()

        # {'DeepNum': 1, 'fatherUrl': None, 'Download': False, 'province': None, 'domain': 'http://search.ccgp.gov.cn',
        #  'FileName': None, 'Keyword': None, 'title': None, 'LastTime': 0.0, 'Flag': 0, 'soup': None, 'State': 0,
        #  'content': None,
        #  'Urlname': 'http://search.ccgp.gov.cn/bxsearch?searchtype=1&page_index=1&bidSort=0&buyerName=&projectId=&pinMu=0&bidType=1&dbselect=bidx&kw=&start_time=2018%3A06%3A07&end_time=2018%3A06%3A07&timeType=6&displayZone=&zoneId=&pppStatus=0&agentName=',
        #  'SleepTime': 0.0, 'FilePath': None}

        html = None  # full raw page content
        ctifety = 0  # flag: content was fetched and can be parsed for child links
        Flag = 1  # flag: keep retrying the fetch
        count = 0  # counter of failed/empty fetches
        while Flag:
            try:
                if count > 1:
                    self.ipProxy = self.getIpPoolMethod()

                protocol = 'https' if 'https' in self.ipProxy else 'http'
                proxiesmmm = {protocol: self.ipProxy}

                req = requests.get(data['Urlname'],
                                   headers=self.heads,
                                   allow_redirects=False,
                                   proxies=proxiesmmm,
                                   timeout=3)
                # Skip the anti-crawler security-verification page
                soup_validate = BeautifulSoup(req.text, 'lxml')
                if soup_validate.find(name='title').string == '安全验证':
                    self.ipProxy = self.getIpPoolMethod()
                    continue

                if req.status_code != 200:
                    self.ipProxy = self.getIpPoolMethod()
                    continue

                reqheaders = req.headers
                if "application" in reqheaders["Content-Type"]:
                    data = self.__downlowdFile(data=data, req=req)
                    data['Download'] = 1
                elif "text" in reqheaders["Content-Type"]:
                    html = req.content
                    data['Download'] = 0
                    ctifety = 1
                    Flag = 0  # done fetching; leave the retry loop
                else:
                    continue
            except requests.exceptions.ConnectTimeout as e:
                Log.e("getSoupAndDeepnumOrDown HeadError -> " + str(e))
                self.heads = self.headersEngine.getHeaders()
                count += 1
                if html is None:
                    Flag = 1
            except (ConnectionError, Timeout) as e:
                Flag = 1
                count += 1
                Log.e("getSoupAndDeepnumOrDown HeadError -> " + str(e))
                self.heads = self.headersEngine.getHeaders()
                # Close surplus connections; a requests "Max retries exceeded with url" error occurred here
                requests.adapters.DEFAULT_RETRIES = 5
                s = requests.session()
                s.keep_alive = False
                if html is None:
                    Flag = 1
                # if count > 4:
                #     return None
                pass
            except Exception as e:
                Flag = 1
                count += 1
                #TODO handle javascript:void(0) hrefs; ignore this kind of exception: https://www.zhihu.com/question/20626694?from=profile_question_card
                #TODO handle the invalid-header exception: Invalid return character or leading space in header: Accept-Language
                #TODO handle HTTPConnectionPool max retries / Failed to establish a new connection
                Log.e("getSoupAndDeepnumOrDown Exception -> " + str(e))
                self.heads = self.headersEngine.getHeaders()
                # Handle "Max retries exceeded with url" errors
                s = requests.session()
                s.keep_alive = False
                count += 1
                if html is None:
                    Flag = 1
                pass

        if ctifety:
            data['content'] = html
            soup = BeautifulSoup(html, 'html.parser')  # quick parse with BeautifulSoup
        else:
            soup = None

        data['soup'] = soup
        # Log.i(data['content'].decode('utf-8'))
        return data

    def getChildrenLink(self, pageIndex):
        """
        获取子链接
        :return:
        """
        pattern = r'htt(p|ps):\/\/(\w+\.)+\w+/(\w+/)*'
        pattern = re.compile(pattern)
        # print("domain" + str(self.Url_inf.Urlname))
        Keyvalue = pattern.search(pageIndex['Urlname'])
        # Keyvalue  <_sre.SRE_Match object; span=(0, 26), match='http://search.ccgp.gov.cn/'>
        # print("Keyvalue  " + str(Keyvalue))
        # print(self.Url_inf.Urlname)
        if Keyvalue is not None:
            Keyvalue = Keyvalue.group()
        else:
            Keyvalue = domain = urlparse(
                pageIndex['Urlname']).scheme + "://" + urlparse(
                    pageIndex['Urlname']).netloc

        domain = Keyvalue
        URL_infor = []
        URL_infor2 = []
        Links = []
        link2 = ''
        title = ''
        currentTime = ''
        total_title = ''

        # if self.Url_inf.soup == None:
        #     return []
        if USE_BXBLS is True:
            # Split into two business branches
            # if self.Url_inf.Urlname.find("zygg"):
            #     ul_content = self.Url_inf.soup.select(".c_list_bid")[0]
            # elif self.Url_inf.Urlname.find("dfgg"):
            #     ul_content = self.Url_inf.soup.select(".c_list_bid")[0]
            # else:
            #     ul_content = self.Url_inf.soup
            if pageIndex['soup'] is None:
                return []
            else:
                urlInfoList = pageIndex['soup'].select(
                    ".vT-srch-result-list-bid")

            if urlInfoList is None:
                return []

            if urlInfoList:
                ul_content = urlInfoList[0]
            else:
                return []

            for li in ul_content.select("li"):
                link = li.select("a")[0]

                # emProvince = li.select("span")[2].get_text()
                spanProvince = li.select("span")[0]
                emProvince = spanProvince.select("a")[0].get_text()
                currentTime = time.time()

                try:
                    href2 = link['href']
                    total_title = link['title']
                except KeyError:
                    pageIndex['soup'].select("a").remove(link)
                    continue  # no href/title on this <a>; skip it so href2 is never referenced unbound
                if href2.startswith("/"):  # startswith() checks whether the string begins with the given prefix
                    # link2 = urljoin(self.Url_inf.Urlname, href2)
                    # print(str(link2))

                    # link2=self.Url_inf.Urlname+href2
                    title = link.text.replace('\n',
                                              '').replace('\t',
                                                          '').replace(' ', '')
                elif (href2.startswith("../../..")):
                    title = link.text.replace('\n',
                                              '').replace('\t',
                                                          '').replace(' ', '')
                    # link2=href2.replace('../../..',domain)
                elif href2.startswith(".."):
                    title = link.text.replace('\n',
                                              '').replace('\t',
                                                          '').replace(' ', '')
                    # link2=href2.replace('..',domain)
                elif href2.startswith("./"):
                    title = link.text.replace('\n',
                                              '').replace('\t',
                                                          '').replace(' ', '')
                    # link2=href2.replace('./',domain+'/')
                elif 'http' in href2 and 'gov' in href2:
                    title = link.text.replace('\n',
                                              '').replace('\t',
                                                          '').replace(' ', '')
                    # link2=href2

                link2 = urljoin(pageIndex['Urlname'], href2)
                # print("link2 is :" + str(link2))
                # Handle truncated titles (text ending in "...")
                if title.find("...") > -1:
                    title = total_title

                title = title.strip('\r')
                myLinkUrl = URLinformation(Urlname=link2,
                                           title=title,
                                           DeepNum=pageIndex['DeepNum'] - 1,
                                           domain=pageIndex['domain'],
                                           fatherUrl=pageIndex['Urlname'],
                                           province=emProvince,
                                           LastTime=currentTime)
                URL_infor.append(myLinkUrl)

        else:
            for link in pageIndex['soup'].select("a"):
                # print(str(self.Url_inf.soup))
                # <a href="http://www.ccgp.gov.cn/cggg/dfgg/gkzb/201310/t20131008_3148218.htm" style="line-height:18px" target="_blank">
                #                                         南方科技大学等离子体技术基础仪器采购项目招标公告
                #                                     </a>
                # <a href="http://www.ccgp.gov.cn/cggg/dfgg/gkzb/201309/t20130926_3144053.htm" style="line-height:18px" target="_blank">
                #                                         2013年国家良种补贴牦牛、绵羊、奶牛冻精、肉牛冻精采购项目公开招标公告
                # print("children url is : "+ str(link))
                try:
                    href2 = link['href']  # pull out the href URL; concrete examples shown above
                    # print("href2:   " + str(href2))
                    # The extracted value takes one of the following 3 forms:
                    # http://www.ccgp.gov.cn/cggg/dfgg/gkzb/201309/t20130926_3144362.htm
                    # javascript:void(0)
                    # #
                except KeyError:
                    pageIndex['soup'].select("a").remove(link)

                else:  # runs only when the try block raised no KeyError
                    if href2.startswith("/"):  # startswith() checks whether the string begins with the given prefix
                        # link2 = urljoin(self.Url_inf.Urlname, href2)
                        # print(str(link2))

                        # link2=self.Url_inf.Urlname+href2
                        title = link.text.replace('\n', '').replace(
                            '\t', '').replace(' ', '')
                    elif (href2.startswith("../../..")):
                        title = link.text.replace('\n', '').replace(
                            '\t', '').replace(' ', '')
                        # link2=href2.replace('../../..',domain)
                    elif href2.startswith(".."):
                        title = link.text.replace('\n', '').replace(
                            '\t', '').replace(' ', '')
                        # link2=href2.replace('..',domain)
                    elif href2.startswith("./"):
                        title = link.text.replace('\n', '').replace(
                            '\t', '').replace(' ', '')
                        # link2=href2.replace('./',domain+'/')
                    elif 'http' in href2 and 'gov' in href2:
                        title = link.text.replace('\n', '').replace(
                            '\t', '').replace(' ', '')
                        # link2=href2

                    link2 = urljoin(pageIndex['Urlname'], href2)
                    # print("link2 is :" + str(link2))
                    myLinkUrl = URLinformation(Urlname=link2,
                                               title=title,
                                               DeepNum=pageIndex['DeepNum'] -
                                               1,
                                               domain=pageIndex['domain'],
                                               fatherUrl=pageIndex['Urlname'])
                    URL_infor.append(myLinkUrl)

        if USE_BXBLS is True:
            Links = list(set(URL_infor))
        else:
            #TODO AttributeError: 'NoneType' object has no attribute 'select' can occur here
            for http in pageIndex['soup'].select('option'):  # contents unknown for now
                try:
                    http2 = http['value']
                    # print("option" + str(http))
                except KeyError:
                    pageIndex['soup'].select("option").remove(http)
                else:
                    if "gov" in http2 and 'http' in http2:
                        myLinkUrl2 = URLinformation(
                            Urlname=http2,
                            title=http.text,
                            DeepNum=pageIndex['DeepNum'] - 1,
                            domain=pageIndex['domain'],
                            fatherUrl=pageIndex['Urlname'])
                        URL_infor2.append(myLinkUrl2)

            Links = list(set(URL_infor + URL_infor2))

        #TODO [2018-05-15 18:13:47.492] [INFO] [31469] [getChildrenLink(),ParseCCGPModule.py:129]: This url have 56  children urls1
        Log.i("This url have " + str(len(Links)) + "  children urls" +
              str(pageIndex['DeepNum']))
        return Links

    def getPageNumFromHome(self, dowloadData):
        """
        获取分页的页码URL
        """
        if dowloadData['soup'] is None:
            return []
        else:
            # Log.i(dowloadData['content'].decode('utf-8'))
            selector = etree.HTML(dowloadData['content'].decode('utf-8'))

            try:
                page = (int(
                    selector.xpath(
                        '//div[@class="vT_z"]/div[1]/div/p[1]/span[2]/text()')
                    [0]) // 20) + 3
            except Exception:
                return []

            if page is None:
                return []
            parentURL_infor = []
            # Use a random number to decide whether to walk the pages in reverse order
            num = random.randint(3, 7)
            # Holds the rewritten page URL
            tempUrl = ''
            if (num % 2) == 0:
                for i in range(1, page):
                    #TODO string substitution here is problematic
                    #'http://search.ccgp.gov.cn/bxsearch?searchtype=1&page_index=1&bidSort=0&buyerName=&projectId=&pinMu=0&bidType=0&dbselect=bidx&kw=&start_time=2018%3A06%3A04&end_time=2018%3A06%3A11&timeType=6&displayZone=&zoneId=&pppStatus=0&agentName='
                    # x = 'page_index=' + str(i)
                    # dowloadData['Urlname'] = re.sub(r'page_index=(.)', x, dowloadData['Urlname'])
                    #TODO the URL concatenation here is also problematic
                    # dowloadData['Urlname'] = 'http://search.ccgp.gov.cn/bxsearch?searchtype=1&page_index=' + str(i) \
                    #                          + '&bidSort=0&buyerName=&projectId=&pinMu=0&bidType=0&dbselect=bidx&kw=&start_time='\
                    #                          +crawlerStartTime+'&end_time='+crawlerEndTime+'&timeType=6&displayZone=&zoneId=&pppStatus=0&agentName='
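                    # Note (illustration of the TODO above): r'page_index=(.)' captures only a
                    # single character after "page_index=", so rewriting "page_index=12" to page 3
                    # yields "page_index=32"; a pattern like r'page_index=\d+' (an assumption,
                    # not the project's fix) would replace the whole page number.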
                    x = 'page_index=' + str(i)
                    tempUrl = re.sub(r'page_index=(.)', x,
                                     dowloadData['Urlname'])
                    Log.i("parseUrl<<" + tempUrl)
                    urlChildInfo = URLinformation(
                        Urlname=tempUrl,
                        title=dowloadData['title'],
                        DeepNum=dowloadData['DeepNum'],
                        domain=dowloadData['domain'],
                        fatherUrl=dowloadData['fatherUrl'])
                    parentURL_infor.append(urlChildInfo)
                else:
                    if parentURL_infor is not None:
                        page = 0
                        return parentURL_infor
            else:
                for i in range(page - 1, 0, -1):
                    #TODO string substitution here is problematic
                    #'http://search.ccgp.gov.cn/bxsearch?searchtype=1&page_index=1&bidSort=0&buyerName=&projectId=&pinMu=0&bidType=0&dbselect=bidx&kw=&start_time=2018%3A06%3A04&end_time=2018%3A06%3A11&timeType=6&displayZone=&zoneId=&pppStatus=0&agentName='
                    # x = 'page_index=' + str(i)
                    # dowloadData['Urlname'] = re.sub(r'page_index=(.)', x, dowloadData['Urlname'])
                    # dowloadData['Urlname'] = 'http://search.ccgp.gov.cn/bxsearch?searchtype=1&page_index=' + str(i) \
                    #                          + '&bidSort=0&buyerName=&projectId=&pinMu=0&bidType=0&dbselect=bidx&kw=&start_time='\
                    #                          +crawlerStartTime+'&end_time='+crawlerEndTime+'&timeType=6&displayZone=&zoneId=&pppStatus=0&agentName='

                    x = 'page_index=' + str(i)
                    tempUrl = re.sub(r'page_index=(.)', x,
                                     dowloadData['Urlname'])
                    Log.i("parseUrl<<" + tempUrl)
                    urlChildInfo = URLinformation(
                        Urlname=tempUrl,
                        title=dowloadData['title'],
                        DeepNum=dowloadData['DeepNum'],
                        domain=dowloadData['domain'],
                        fatherUrl=dowloadData['fatherUrl'])
                    parentURL_infor.append(urlChildInfo)
                else:
                    if parentURL_infor is not None:
                        page = 0
                        return parentURL_infor

    def __downlowdFile(self, data, req):
        # http://stackoverflow.com/questions/862173/how-to-download-a-file-using-python-in-a-smarter-way
        """
        下载文件的代码逻辑.沿用王洋逻辑,还没调试
        :param url:
        :param req:
        """
        reqheaders = req.headers
        revealfn = data['Urlname'].split('/')[-1]

        if "." in revealfn[-6:]:
            fileName = revealfn
        else:
            if ('Content-Disposition' in reqheaders.keys()):
                fileName = reqheaders['Content-Disposition'].split(
                    'filename=')[1]
                fileName = fileName.replace('"', '').replace("'", "")
            else:
                r = urllib.request.urlopen(data['Urlname'])
                if r.url != data['Urlname']:
                    fileName = basename(urlsplit(r.url)[2])
            data['FileName'] = fileName

        _FileName = None
        if (data['FilePath']):
            _FileName = str(data['FilePath']) + fileName
        else:
            _FileName = fileName

        with open(_FileName, "wb") as donefile:
            for chunk in req.iter_content(chunk_size=1024):
                if chunk:
                    donefile.write(chunk)

        Log.i("File:" + _FileName + "downLoaded")
        return data

    def get_md5(self, url, content):

        # If content is None, only the url is encoded
        """
        Generate an MD5 digest from the URL and content (used as the dedup key).
        :return:
        """
        # Note: str.join() interleaves the unquoted url between the characters of
        # str(content); the result is deterministic, so it still works as a key.
        return hashlib.md5((urllib.parse.unquote(url).join(
            str(content)).encode("utf-8"))).hexdigest()  # md5 digest

    def __checkURL(self, urlName):
        """
        查重函数
        :param urlName:
        :return:
        """
        item = {"_id": urlName}
        value = self.mogodbControl.findone(item,
                                           self.__Sendcollection)  # 查询到返回文档
        if value == None:  # 说明没找到
            return False
        else:
            return True  # 说明找到了

    def getIpPoolMethod(self):
        """
        获取免费代理ip
        :return: 返回一个免费的ip代理
        """
        ipProxy = None
        if ipProxy is None:
            if USE_PROXY is True:
                # Make sure at least one IP is available when fetching a proxy
                proxyIpPool = getIpProxyPool()
                if proxyIpPool is not None:
                    ipProxy = proxyIpPool

                if ipProxy is None:
                    ipProxy = PROXY_NONE_URL
            else:
                ipProxy = PROXY_NONE_URL

        return ipProxy

    def simpleRun(self):
        '''
        Producer process loop: re-reads the task/source list every 60*60*60*24 seconds.
        :return:
        '''
        Log.i('ProducerUrl.run() in {0}'.format(time.ctime()))
        while True:
            # Resource check
            # KafkaOperator = kafkaUrlinformation()
            KafkaOperator = localKafkaUrlinformation()
            # if self.mogodbControl is None:
            #     self.mogodbControl = Mongodb_Operator(Dbdata["host"], Dbdata["port"], Dbdata["db_name"],
            #                                       Dbdata["default_collection"])
            # Parse the data sources
            # if USE_SOURCEURL_TYPE is True:
            #     if USE_ASYNCTASK_TYPE is True:
            #         urlInformationList = ConfigUtil.readSourceListRealTime()
            #     else:
            #         urlInformationList = ConfigUtil.readSourceList()
            # else:
            #     urlInformationList = ConfigUtil.readTaskList()

            urlInformationList = ConfigUtil.readSourceListRealTime()

            # Crawl and parse the child URLs
            if urlInformationList is None:
                continue

            for urlInfor in urlInformationList:
                data = urlInfor.class2dict()

                # Fetch the landing page content
                dowloadData = self.downLoadHtml(data)
                if dowloadData is None:
                    continue
                # Parse out the pagination URLs
                pageData = self.getPageNumFromHome(dowloadData)
                if pageData is None:
                    continue
                for pageIndex in pageData:
                    # Fetch the page content
                    dowloadPageData = self.downLoadHtml(pageIndex.class2dict())
                    if dowloadPageData is None:
                        continue
                    # Extract child links
                    # self.URL_inf.dict2class(pageIndex)
                    ccgpChildrenLink = self.getChildrenLink(dowloadPageData)
                    if ccgpChildrenLink is None:
                        continue
                    # Send the child links to Kafka
                    for link in ccgpChildrenLink:
                        # Ensure the Mongo connection exists
                        if self.mogodbControl is None:
                            self.mogodbControl = Mongodb_Operator(
                                DbdataCCGPDFZB["host"], DbdataCCGPDFZB["port"],
                                DbdataCCGPDFZB["db_name"],
                                DbdataCCGPDFZB["default_collection"])
                        # Deduplicate so the same link is not re-sent to Kafka (saves resources)
                        if link.title is None:  # skip links with no title
                            continue
                        uuid = self.get_md5(link.Urlname, link.title)
                        item = {"uuid": uuid}
                        value = self.mogodbControl.findone(
                            item, self.__Sendcollection)  # returns the document if found
                        # #TODO database insertion has issues
                        if value is not None:  # already in the database; skip
                            continue
                        # Per Yu Hao, do not send parent links
                        if link.DeepNum >= 0:
                            producerData = json.dumps(link.class2dict())
                            Log.i("produce<<" + producerData)
                            KafkaOperator.producerUrl(producerData)

            # Runs once per day, so no sleep is needed; an external crontab job drives scheduling
            if self.crontab == 1:
                os._exit(0)
            else:
                # Sleep for n seconds (read from the config file)
                items = ConfigUtil.getItems('producerScheduler')
                interval_min = items['interval_min']
                interval_max = items['interval_max']
                seconds = random.randint(int(interval_min), int(interval_max))
                Log.i('StartProducerUrl sleep ' + str(seconds) + ' seconds')
                time.sleep(seconds)
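
# A brief usage sketch (the process name and inline invocation here are assumptions, not part
# of the original interface): crontab == 1 makes simpleRun() exit after a single pass so an
# external cron job can drive scheduling; any other value keeps the internal sleep loop.
if __name__ == '__main__':
    producer = simpleProducerChildUrl(processName='ccgp-producer', crontab=1)
    producer.simpleRun()
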
class ParserCCGPModule():
    def __init__(self, Url_inf, KafkaOperator):
        self.myDb = Mongodb_Operator(DbdataCCGPDFZB["host"],
                                     DbdataCCGPDFZB["port"],
                                     DbdataCCGPDFZB["db_name"],
                                     DbdataCCGPDFZB["default_collection"])
        self.Url_inf = Url_inf
        self.KafkaOperator = KafkaOperator

    def get_md5(self):

        # If content is None, only the url is encoded
        """
        Generate an MD5 digest from the URL and content.
        :return:
        """
        # if self.Url_inf.title != None:
        #     return hashlib.md5((self.Url_inf.Urlname+self.Url_inf.title).encode("utf-8")).hexdigest()  # md5 digest
        # else:
        #     return hashlib.md5((self.Url_inf.Urlname).encode("utf-8")).hexdigest()  # md5 digest

        # if self.Url_inf.content is not None and self.Url_inf.Urlname is not None and self.Url_inf.title is not None:
        #     return hashlib.md5((urllib.parse.unquote(self.Url_inf.Urlname).join(str(self.Url_inf.content)).encode("utf-8"))).hexdigest()  # md5 digest
        #     # return hashlib.md5((self.Url_inf.Urlname+self.Url_inf.title.decode("utf8","ignore")).encode("utf-8")).hexdigest()  # md5 digest
        # else:
        #     return hashlib.md5((urllib.parse.unquote(self.Url_inf.Urlname).join(str(self.Url_inf.content)).encode(
        #         "utf-8"))).hexdigest()  # md5 digest
        return hashlib.md5((urllib.parse.unquote(self.Url_inf.Urlname).join(
            str(self.Url_inf.content)).encode("utf-8"))).hexdigest()  # md5 digest

    # @AsycThread.async
    def getChildrenLink(self):
        """
        获取子链接
        :return:
        """
        pattern = r'htt(p|ps):\/\/(\w+\.)+\w+/(\w+/)*'
        pattern = re.compile(pattern)
        # print("domain" + str(self.Url_inf.Urlname))
        Keyvalue = pattern.search(self.Url_inf.Urlname)
        # Keyvalue  <_sre.SRE_Match object; span=(0, 26), match='http://search.ccgp.gov.cn/'>
        # print("Keyvalue  " + str(Keyvalue))
        # print(self.Url_inf.Urlname)
        if Keyvalue is not None:
            Keyvalue = Keyvalue.group()
        else:
            Keyvalue = domain = urlparse(
                self.Url_inf.Urlname).scheme + "://" + urlparse(
                    self.Url_inf.Urlname).netloc

        domain = Keyvalue
        URL_infor = []
        URL_infor2 = []
        Links = []
        link2 = ''
        title = ''
        total_title = ''

        # if self.Url_inf.soup == None:
        #     return []
        if USE_BXBLS is True:
            # Split into two business branches
            # if self.Url_inf.Urlname.find("zygg"):
            #     ul_content = self.Url_inf.soup.select(".c_list_bid")[0]
            # elif self.Url_inf.Urlname.find("dfgg"):
            #     ul_content = self.Url_inf.soup.select(".c_list_bid")[0]
            # else:
            #     ul_content = self.Url_inf.soup
            if self.Url_inf.soup is None:
                return []
            else:
                urlInfoList = self.Url_inf.soup.select(".c_list_bid")

            if urlInfoList is None:
                return []

            if urlInfoList:
                ul_content = urlInfoList[0]
            else:
                return []

            for li in ul_content.select("li"):
                link = li.select("a")[0]
                emProvince = li.select("em")[2].get_text()

                try:
                    href2 = link['href']
                    total_title = link['title']
                except KeyError:
                    self.Url_inf.soup.select("a").remove(link)
                else:
                    if href2.startswith("/"):  # startswith() checks whether the string begins with the given prefix
                        # link2 = urljoin(self.Url_inf.Urlname, href2)
                        # print(str(link2))

                        # link2=self.Url_inf.Urlname+href2
                        title = link.text.replace('\n', '').replace(
                            '\t', '').replace(' ', '')
                    elif (href2.startswith("../../..")):
                        title = link.text.replace('\n', '').replace(
                            '\t', '').replace(' ', '')
                        # link2=href2.replace('../../..',domain)
                    elif href2.startswith(".."):
                        title = link.text.replace('\n', '').replace(
                            '\t', '').replace(' ', '')
                        # link2=href2.replace('..',domain)
                    elif href2.startswith("./"):
                        title = link.text.replace('\n', '').replace(
                            '\t', '').replace(' ', '')
                        # link2=href2.replace('./',domain+'/')
                    elif 'http' in href2 and 'gov' in href2:
                        title = link.text.replace('\n', '').replace(
                            '\t', '').replace(' ', '')
                        # link2=href2

                    link2 = urljoin(self.Url_inf.Urlname, href2)
                    # print("link2 is :" + str(link2))
                    # Handle truncated titles (text ending in "...")
                    if title.find("...") > -1:
                        title = total_title

                    myLinkUrl = URLinformation(Urlname=link2,
                                               title=title,
                                               DeepNum=self.Url_inf.DeepNum -
                                               1,
                                               domain=self.Url_inf.domain,
                                               fatherUrl=self.Url_inf.Urlname,
                                               province=emProvince)
                    URL_infor.append(myLinkUrl)

        else:
            for link in self.Url_inf.soup.select("a"):
                # print(str(self.Url_inf.soup))
                # <a href="http://www.ccgp.gov.cn/cggg/dfgg/gkzb/201310/t20131008_3148218.htm" style="line-height:18px" target="_blank">
                #                                         南方科技大学等离子体技术基础仪器采购项目招标公告
                #                                     </a>
                # <a href="http://www.ccgp.gov.cn/cggg/dfgg/gkzb/201309/t20130926_3144053.htm" style="line-height:18px" target="_blank">
                #                                         2013年国家良种补贴牦牛、绵羊、奶牛冻精、肉牛冻精采购项目公开招标公告
                # print("children url is : "+ str(link))
                try:
                    href2 = link['href']  # pull out the href URL; concrete examples shown above
                    # print("href2:   " + str(href2))
                    # The extracted value takes one of the following 3 forms:
                    # http://www.ccgp.gov.cn/cggg/dfgg/gkzb/201309/t20130926_3144362.htm
                    # javascript:void(0)
                    # #
                except KeyError:
                    self.Url_inf.soup.select("a").remove(link)

                else:  # runs only when the try block raised no KeyError
                    if href2.startswith("/"):  # startswith() checks whether the string begins with the given prefix
                        # link2 = urljoin(self.Url_inf.Urlname, href2)
                        # print(str(link2))

                        # link2=self.Url_inf.Urlname+href2
                        title = link.text.replace('\n', '').replace(
                            '\t', '').replace(' ', '')
                    elif (href2.startswith("../../..")):
                        title = link.text.replace('\n', '').replace(
                            '\t', '').replace(' ', '')
                        # link2=href2.replace('../../..',domain)
                    elif href2.startswith(".."):
                        title = link.text.replace('\n', '').replace(
                            '\t', '').replace(' ', '')
                        # link2=href2.replace('..',domain)
                    elif href2.startswith("./"):
                        title = link.text.replace('\n', '').replace(
                            '\t', '').replace(' ', '')
                        # link2=href2.replace('./',domain+'/')
                    elif 'http' in href2 and 'gov' in href2:
                        title = link.text.replace('\n', '').replace(
                            '\t', '').replace(' ', '')
                        # link2=href2

                    link2 = urljoin(self.Url_inf.Urlname, href2)
                    # print("link2 is :" + str(link2))
                    myLinkUrl = URLinformation(Urlname=link2,
                                               title=title,
                                               DeepNum=self.Url_inf.DeepNum -
                                               1,
                                               domain=self.Url_inf.domain,
                                               fatherUrl=self.Url_inf.Urlname)
                    URL_infor.append(myLinkUrl)

        if USE_BXBLS is True:
            Links = list(set(URL_infor))
        else:
            #TODO AttributeError: 'NoneType' object has no attribute 'select' can occur here
            for http in self.Url_inf.soup.select('option'):  # contents unknown for now
                try:
                    http2 = http['value']
                    # print("option" + str(http))
                except KeyError:
                    self.Url_inf.soup.select("option").remove(http)
                else:
                    if "gov" in http2 and 'http' in http2:
                        myLinkUrl2 = URLinformation(
                            Urlname=http2,
                            title=http.text,
                            DeepNum=self.Url_inf.DeepNum - 1,
                            domain=self.Url_inf.domain,
                            fatherUrl=self.Url_inf.Urlname)
                        URL_infor2.append(myLinkUrl2)

            Links = list(set(URL_infor + URL_infor2))

        #TODO [2018-05-15 18:13:47.492] [INFO] [31469] [getChildrenLink(),ParseCCGPModule.py:129]: This url have 56  children urls1
        Log.i("This url have " + str(len(Links)) + "  children urls" +
              str(self.Url_inf.DeepNum))
        return Links

    @AsycThread.async
    def savedata(self):
        """
        Save the data to MongoDB.
        """
        uuid = self.get_md5()
        item = {
            "uuid": uuid,
            "url": self.Url_inf.Urlname,
            "title": self.Url_inf.title,
            "time": datetime.now().timestamp(),
            "lastTime": self.Url_inf.LastTime,
            "content": self.Url_inf.content,
            "fatherUrl": self.Url_inf.fatherUrl,
            "province": self.Url_inf.province
        }
        string = self.Url_inf.domain.replace('.',
                                             '').replace('/',
                                                         '').replace(':', '')
        # Dedup / delete / replace
        if USE_SOURCEURL_TYPE is True:
            if self.Url_inf.province is not None and self.Url_inf.content is not None:
                value = self.myDb.findone({"uuid": uuid})  # 查询到返回文档
                if value is None:
                    Log.i(self.Url_inf.content.decode('utf-8'))
                    self.myDb.insert(item, string)
                    self.myDb.ensure_index("uuid", string)
                    self.KafkaOperator.producterUUID(
                        json.dumps({
                            "uuid": uuid,
                            'collection': string
                        }))
        else:
            self.myDb.insert(item, string)
            self.myDb.ensure_index("uuid", string)
            self.KafkaOperator.producterUUID(
                json.dumps({
                    "uuid": uuid,
                    'collection': string
                }))

    def getLinks(self):
        """
        Public interface: return the child links of the current URL.
        :return:
        """
        if self.Url_inf.Download:
            return []
        else:
            # Store asynchronously
            self.savedata()
            return self.getChildrenLink()

    def setURL_inf(self, URL_inf):
        """
        接口
        :param URL_inf:
        """
        self.Url_inf = URL_inf
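
# A hedged usage sketch for ParserCCGPModule, inferred from getLinks()/savedata() above (the
# URLinformation field values below are assumptions): savedata() persists the current page and
# getChildrenLink() returns child URLinformation objects with DeepNum decremented.
if __name__ == '__main__':
    url_inf = URLinformation(Urlname='http://search.ccgp.gov.cn/',
                             DeepNum=1,
                             domain='http://search.ccgp.gov.cn')
    parser = ParserCCGPModule(url_inf, localKafkaUrlinformation())
    for child in parser.getLinks():
        print(child.Urlname, child.title)
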
class ComsumerChildUrl(multiprocessing.Process):
    def __init__(self, processName, pipeDictData):
        multiprocessing.Process.__init__(self)
        self.processName = processName
        # Initialize the proxy IP and request headers
        self.ipProxy = self.getIpPoolMethod()
        self.headersEngine = HeadersEngine()
        self.heads = self.headersEngine.getHeaders()
        # Database model and controller
        self.URL_inf = URLinformation()
        self.__Sendcollection = "httpsearchccgpgovcn"
        self.mogodbControl = None
        self.KafkaOperator = None
        self.pipeDictData = pipeDictData  # pipe carrying task-URL messages

    def downLoadHtml(self):
        """
        爬取并提取子链接
        :param urlInfor:
        """
        if self.ipProxy is None:
            self.ipProxy = self.getIpPoolMethod()
        if self.heads is None:
            self.heads = self.headersEngine.getHeaders()

        # {'DeepNum': 1, 'fatherUrl': None, 'Download': False, 'province': None, 'domain': 'http://search.ccgp.gov.cn',
        #  'FileName': None, 'Keyword': None, 'title': None, 'LastTime': 0.0, 'Flag': 0, 'soup': None, 'State': 0,
        #  'content': None,
        #  'Urlname': 'http://search.ccgp.gov.cn/bxsearch?searchtype=1&page_index=1&bidSort=0&buyerName=&projectId=&pinMu=0&bidType=1&dbselect=bidx&kw=&start_time=2018%3A06%3A07&end_time=2018%3A06%3A07&timeType=6&displayZone=&zoneId=&pppStatus=0&agentName=',
        #  'SleepTime': 0.0, 'FilePath': None}

        html = None  # full raw page content
        ctifety = 0  # flag: content was fetched and can be parsed for child links
        Flag = 1  # flag: keep retrying the fetch
        count = 0  # counter of failed/empty fetches
        while Flag:
            try:
                if count > 1:
                    self.ipProxy = self.getIpPoolMethod()

                protocol = 'https' if 'https' in self.ipProxy else 'http'
                proxiesmmm = {protocol: self.ipProxy}

                req = requests.get(self.URL_inf.Urlname,
                                   headers=self.heads,
                                   allow_redirects=False,
                                   proxies=proxiesmmm,
                                   timeout=3)
                # Skip the anti-crawler security-verification page
                soup_validate = BeautifulSoup(req.text, 'lxml')
                if soup_validate.find(name='title').string == '安全验证':
                    self.ipProxy = self.getIpPoolMethod()
                    continue
                if req.status_code != 200:
                    self.ipProxy = self.getIpPoolMethod()
                    continue

                reqheaders = req.headers
                if "application" in reqheaders["Content-Type"]:
                    data = self.__downlowdFile(data=self.URL_inf, req=req)
                    data['Download'] = 1
                elif "text" in reqheaders["Content-Type"]:
                    html = req.content
                    self.URL_inf.Download = 0
                    ctifety = 1
                    Flag = 0  # done fetching; leave the retry loop
                else:
                    continue
            except requests.exceptions.ConnectTimeout as e:
                Log.e("getSoupAndDeepnumOrDown HeadError -> " + str(e))
                self.heads = self.headersEngine.getHeaders()
                count += 1
                if html is None:
                    Flag = 1
            except (ConnectionError, Timeout) as e:
                Flag = 1
                count += 1
                Log.e("getSoupAndDeepnumOrDown HeadError -> " + str(e))
                self.heads = self.headersEngine.getHeaders()
                # Close surplus connections; a requests "Max retries exceeded with url" error occurred here
                requests.adapters.DEFAULT_RETRIES = 5
                s = requests.session()
                s.keep_alive = False
                count += 1
                if html is None:
                    Flag = 1
                pass
            except Exception as e:
                Flag = 1
                count += 1
                #TODO handle javascript:void(0) hrefs; ignore this kind of exception: https://www.zhihu.com/question/20626694?from=profile_question_card
                #TODO handle the invalid-header exception: Invalid return character or leading space in header: Accept-Language
                #TODO handle HTTPConnectionPool max retries / Failed to establish a new connection
                Log.e("getSoupAndDeepnumOrDown Exception -> " + str(e))
                self.heads = self.headersEngine.getHeaders()
                # Handle "Max retries exceeded with url" errors
                s = requests.session()
                s.keep_alive = False
                count += 1
                if html is None:
                    Flag = 1
                pass

        if ctifety:
            self.URL_inf.content = html
            soup = BeautifulSoup(html, 'html.parser')  # quick parse with BeautifulSoup
        else:
            soup = None

        self.URL_inf.soup = soup
        Log.i(self.URL_inf.content.decode('utf-8'))
        return self.URL_inf

    def __downlowdFile(self, data, req):
        # http://stackoverflow.com/questions/862173/how-to-download-a-file-using-python-in-a-smarter-way
        """
        下载文件的代码逻辑.沿用王洋逻辑,还没调试
        :param url:
        :param req:
        """
        reqheaders = req.headers
        revealfn = data['Urlname'].split('/')[-1]

        if "." in revealfn[-6:]:
            fileName = revealfn
        else:
            if ('Content-Disposition' in reqheaders.keys()):
                fileName = reqheaders['Content-Disposition'].split(
                    'filename=')[1]
                fileName = fileName.replace('"', '').replace("'", "")
            else:
                r = urllib.request.urlopen(data['Urlname'])
                if r.url != data['Urlname']:
                    fileName = basename(urlsplit(r.url)[2])
            data['FileName'] = fileName

        _FileName = None
        if (data['FilePath']):
            _FileName = str(data['FilePath']) + fileName
        else:
            _FileName = fileName

        with open(_FileName, "wb") as donefile:
            for chunk in req.iter_content(chunk_size=1024):
                if chunk:
                    donefile.write(chunk)

        Log.i("File:" + _FileName + "downLoaded")
        return data

    def get_md5(self, url, content):

        # If content is None, only the url is encoded
        """
        Generate an MD5 digest from the URL and content (used as the dedup key).
        :return:
        """
        return hashlib.md5((urllib.parse.unquote(url).join(
            str(content)).encode("utf-8"))).hexdigest()  # md5 digest

    def __checkURL(self, urlName):
        """
        查重函数
        :param urlName:
        :return:
        """
        item = {"_id": urlName}
        value = self.mogodbControl.findone(item,
                                           self.__Sendcollection)  # 查询到返回文档
        if value == None:  # 说明没找到
            return False
        else:
            return True  # 说明找到了

    def getIpPoolMethod(self):
        """
        获取免费代理ip
        :return: 返回一个免费的ip代理
        """
        ipProxy = None
        if ipProxy is None:
            if USE_PROXY is True:
                # Make sure at least one IP is available when fetching a proxy
                proxyIpPool = getIpProxyPool()
                if proxyIpPool is not None:
                    ipProxy = proxyIpPool

                if ipProxy is None:
                    ipProxy = PROXY_NONE_URL
            else:
                ipProxy = PROXY_NONE_URL

        return ipProxy

    # @AsycThread.async
    def savedata(self, data):
        """
        保存数据到mongodb
        """
        #查重
        uuid = self.get_md5(data.Urlname, data.title)
        urlInfo = {
            "uuid": uuid,
            "url": data.Urlname,
            "title": data.title,
            "time": datetime.now().timestamp(),
            "content": data.content,
            "fatherUrl": data.fatherUrl,
            "province": data.province,
            "LastTime": data.LastTime
        }
        string = data.domain.replace('.', '').replace('/', '').replace(':', '')
        # Dedup / delete / replace
        if data.province is not None and data.content is not None:
            item = {"uuid": uuid}
            value = self.mogodbControl.findone(
                item, self.__Sendcollection)  # returns the document if found
            #TODO database insertion has issues
            if value is None:
                # item = {"_id": uuid}
                self.mogodbControl.insert(urlInfo, self.__Sendcollection)
                # self.mogodbControl.ensure_index(item, self.__Sendcollection)
                self.KafkaOperator.producterUUID(
                    json.dumps({
                        "uuid": uuid,
                        'collection': string
                    }))

    def run(self):
        '''
        Producer process loop: re-reads tasks every 60*60*60*24 seconds.
        :return:
        '''
        Log.i('ProducerUrl.run() in {0}'.format(time.ctime()))
        while True:
            # Listen for data from the pipe
            DictData = self.pipeDictData.recv()
            if DictData is None:
                continue
            # Convert the raw dict into the entity class
            self.URL_inf.dict2class(DictData)
            # Ensure the Mongo connection exists
            if self.mogodbControl is None:
                self.mogodbControl = Mongodb_Operator(
                    DbdataCCGPDFZB["host"], DbdataCCGPDFZB["port"],
                    DbdataCCGPDFZB["db_name"],
                    DbdataCCGPDFZB["default_collection"])
            # Ensure the Kafka operator exists
            if self.KafkaOperator is None:
                self.KafkaOperator = localKafkaUrlinformation()
            # Deduplication check
            uuid = self.get_md5(self.URL_inf.Urlname, self.URL_inf.title)
            item = {"uuid": uuid}
            value = self.mogodbControl.findone(
                item, self.__Sendcollection)  # returns the document if found
            # #TODO database insertion has issues
            if value is not None:
                continue
            # Fetch the page content
            self.URL_inf = self.downLoadHtml()
            if self.URL_inf is None:
                continue
            # Save the data asynchronously
            self.savedata(self.URL_inf)

            # Sleep for n seconds (read from the config file)
            items = ConfigUtil.getItems('consumerScheduler')
            interval_min = items['interval_min']
            interval_max = items['interval_max']
            seconds = random.randint(int(interval_min), int(interval_max))
            Log.i('StartProducerUrl sleep ' + str(seconds) + ' seconds')
            time.sleep(seconds)
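
# A minimal wiring sketch (the producer side shown here is an assumption): ComsumerChildUrl is a
# multiprocessing.Process whose run() blocks on self.pipeDictData.recv(), so it is fed through
# the receiving end of a multiprocessing Pipe.
if __name__ == '__main__':
    recv_conn, send_conn = multiprocessing.Pipe(duplex=False)
    consumer = ComsumerChildUrl('ccgp-consumer', recv_conn)
    consumer.start()
    # Another process (e.g. the producer) pushes URLinformation dicts into the pipe:
    send_conn.send({'Urlname': 'http://search.ccgp.gov.cn/', 'title': None, 'DeepNum': 1})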