Example #1
    def __init__(self, processName, pipeDictData):
        multiprocessing.Process.__init__(self)
        self.processName = processName
        self.pipeDictData = pipeDictData  # task URL message queue

        # Database model and controller
        self.__Sendcollection = "Send_collection"
        self.URL_inf = URLinformation()
        # self.mogodbControl = Mongodb_Operator(Dbdata["host"], Dbdata["port"], Dbdata["db_name"], Dbdata["default_collection"])
        # Mongo is still used from a single process here, otherwise it keeps throwing warnings
        self.mogodbControl = None
Example #2
    def __init__(self, processName, pipeDictData):
        multiprocessing.Process.__init__(self)
        self.processName = processName
        # Initialize ipProxy and headers
        self.ipProxy = self.getIpPoolMethod()
        self.headersEngine = HeadersEngine()
        self.heads = self.headersEngine.getHeaders()
        # Database model and controller
        self.URL_inf = URLinformation()
        self.__Sendcollection = "httpsearchccgpgovcn"
        self.mogodbControl = None
        self.KafkaOperator = None
        self.pipeDictData = pipeDictData  # task URL message queue
Example #3
    def ayncDownloadTask(self, ipProxy, DictData):
        """
        Run the crawler task asynchronously.
        :param ipProxy:
        :param DictData:
        :return:
        """
        # Log.i(DictData)
        global mutex_lock
        mutex_lock.acquire()  # start of the critical section (mutual exclusion)

        if self.URL_inf is None:
            self.URL_inf = URLinformation()

        # Convert the source data into the entity class
        self.URL_inf.dict2class(DictData)
        # # Deduplication logic
        # if self.__checkURL(self.URL_inf.Urlname):
        #     return
        # else:
        #     item = {"_id": self.__getMD5(self.URL_inf.Urlname)}
        #     self.mogodbControl.insert(item, self.__Sendcollection)

        # TODO keep Wang Yang's earlier code logic here
        self.URL_inf = self.__getSoupAndDeepnumOrDown(ipProxy)
        if self.URL_inf is None or self.URL_inf.Urlname is None or self.URL_inf.content is None:
            mutex_lock.release()  # end of the critical section
            return

        # Deduplication logic, uid = urlName + content
        # hashlib.md5((self.Url_inf.Urlname + self.Url_inf.content.decode("utf8", "ignore")).encode(
        #     "utf-8")).hexdigest()  # MD5 hash
        # Raised AttributeError: 'NoneType' object has no attribute 'decode'
        # checkUrlUID = self.URL_inf.Urlname+self.URL_inf.title.decode("utf8","ignore")
        # checkUrlUID = urllib.parse.unquote(self.URL_inf.Urlname)
        # checkUrlUID = checkUrlUID.join(str(self.URL_inf.content))

        # checkUrlUID = hashlib.md5((urllib.parse.unquote(self.URL_inf.Urlname).join(str(self.URL_inf.content)).encode(
        #     "utf-8"))).hexdigest()  # MD5 hash
        #
        # if self.__checkURL(checkUrlUID):
        #     mutex_lock.release()  # end of the critical section
        #     return
        # else:
        #     item = {"_id": self.__getMD5(checkUrlUID)}
        #     self.mogodbControl.insert(item, self.__Sendcollection)

        # Send child links
        self._sendChildUrl(self.URL_inf, mutex_lock)

        mutex_lock.release()  # end of the critical section
Example #4
    def readTaskList():
        MyUrl_SourceList = []
        ftp = open(TASK_FILENAME, 'r')
        for line in ftp.readlines():
            line = line.strip("\n")
            if not UrlUtil.isLegalUrl(line):
                break
            URL_inf = URLinformation(line, 0, 0.0, 0)  # format
            URL_inf.Flag = 0
            URL_inf.DeepNum = 1
            URL_inf.domain = UrlUtil.getdomain(line)
            MyUrl_SourceList.append(URL_inf)

        ftp.close()
        return MyUrl_SourceList
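Note that readTaskList stops at the first line that fails UrlUtil.isLegalUrl (break, not continue), so anything after an invalid line is ignored. A minimal usage sketch, assuming TASK_FILENAME points to a plain-text file with one seed URL per line (the sample URLs below are hypothetical):

# Hypothetical contents of TASK_FILENAME:
#   http://www.ccgp.gov.cn/cggg/zygg/index.htm
#   http://www.ccgp.gov.cn/cggg/dfgg/index.htm

for url_inf in readTaskList():
    # Each entry is a URLinformation with Flag=0, DeepNum=1 and its domain filled in.
    print(url_inf.Urlname, url_inf.domain)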
Example #5
    def readSourceListByParams(begin, end):
        '''
        Build the source URL list from the configuration file.
        :return: list of URLinformation objects
        '''
        if USE_BXBLS is True:
            MyUrl_SourceList = []
            ftp = open(SOURCEURL_FILENAME, 'r')
            # http://www.ccgp.gov.cn/cggg/zygg/index,0,0,0,0,24
            # http://www.ccgp.gov.cn/cggg/dfgg/index,0,0,0,0,24
            for line in ftp.readlines():
                myUrllist = line.split(',')
                for i in range(int(begin), int(end) + 1):  # store each entry into the list
                    if i == 0:
                        url = myUrllist[0] + ".htm"
                    else:
                        url = myUrllist[0] + "_" + str(i) + ".htm"

                    URL_inf = URLinformation(url, int(myUrllist[1]), 0.0,
                                             float(myUrllist[2]))  # format
                    URL_inf.Flag = 0
                    URL_inf.DeepNum = 1
                    URL_inf.domain = UrlUtil.getdomain(url)
                    MyUrl_SourceList.append(URL_inf)

        else:
            MyUrl_SourceList = []
            ftp = open(SOURCEURL_FILENAME, 'r')
            # http://search.ccgp.gov.cn/bxsearch?searchtype=1&page_index=,&bidSort=0&buyerName=&projectId=&pinMu=0&bidType=1&dbselect=bidx&kw=&start_time=2013%3A04%3A09&end_time=2014%3A04%3A08&timeType=6&displayZone=&zoneId=&pppStatus=0&agentName=,0,0,0,1,9068
            for line in ftp.readlines():
                myUrllist = line.split(',')
                for i in range(int(myUrllist[5]),
                               int(myUrllist[6])):  # store each entry into the list
                    url = myUrllist[0] + str(i) + myUrllist[1]
                    URL_inf = URLinformation(url, int(myUrllist[2]), 0.0,
                                             float(myUrllist[4]))  # format
                    URL_inf.Flag = 0
                    URL_inf.DeepNum = 1
                    URL_inf.domain = UrlUtil.getdomain(url)
                    MyUrl_SourceList.append(URL_inf)
        ftp.close()
        return MyUrl_SourceList
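For reference, a sketch of what the USE_BXBLS branch produces for one of the sample config lines shown in the comments above; only the URL construction is reproduced, and the begin/end page range is hypothetical:

line = "http://www.ccgp.gov.cn/cggg/zygg/index,0,0,0,0,24"
myUrllist = line.split(',')
begin, end = 0, 3  # hypothetical page range
for i in range(int(begin), int(end) + 1):
    url = myUrllist[0] + ".htm" if i == 0 else myUrllist[0] + "_" + str(i) + ".htm"
    print(url)
# http://www.ccgp.gov.cn/cggg/zygg/index.htm
# http://www.ccgp.gov.cn/cggg/zygg/index_1.htm
# http://www.ccgp.gov.cn/cggg/zygg/index_2.htm
# http://www.ccgp.gov.cn/cggg/zygg/index_3.htm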
Example #6
    def __init__(self, processName, crontab):
        self.processName = processName
        self.crontab = crontab
        # Initialize ipProxy and headers
        self.ipProxy = self.getIpPoolMethod()
        self.headersEngine = HeadersEngine()
        self.heads = self.headersEngine.getHeaders()
        # Database model and controller
        self.URL_inf = URLinformation()
        self.__Sendcollection = "httpsearchccgpgovcn"
        self.mogodbControl = None
Example #7
    def readSourceListRealTime():
        """
        Build the parent URLs for real-time crawling.
        bidType field: bid type
        page_index field: page number
        start_time=2018%3A06%3A06 field: start time, 2018-06-06
        end_time=2018%3A06%3A06 field: end time, 2018-06-06
        """
        #http://search.ccgp.gov.cn/bxsearch?searchtype=1&page_index=1&bidSort=0&buyerName=&projectId=&pinMu=0&bidType=1&dbselect=bidx&kw=&start_time=,
        # &end_time=,
        # &timeType=6&displayZone=&zoneId=&pppStatus=0&agentName=

        # Take today's date
        # nowTime = datetime.datetime.now().strftime('%Y-%m-%d').split('-')
        # strNowTime = nowTime[0]+'%3A'+nowTime[1]+'%3A'+nowTime[2]
        # Lao Luo asked for one week of data
        strNowTime = crawlerStartTime
        strEndTime = crawlerEndTime
        MyUrl_SourceList = []

        ftp = open(SOURCEURL_FILENAME, 'r')
        for line in ftp.readlines():
            myUrllist = line.split(',')
            # url = myUrllist[0]+strNowTime+myUrllist[1]+strNowTime+myUrllist[2]
            url = myUrllist[0] + strNowTime + myUrllist[
                1] + strEndTime + myUrllist[2]
            URL_inf = URLinformation(url.strip('\n'), int(0), 0.0,
                                 float(0))  # format
            URL_inf.Flag = 0
            URL_inf.DeepNum = 1
            URL_inf.domain = UrlUtil.getdomain(url)
            MyUrl_SourceList.append(URL_inf)
        else:
            ftp.close()

        return MyUrl_SourceList
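The commented-out lines above show how a date such as 2018%3A06%3A06 is built: a YYYY:MM:DD string with ':' percent-encoded as %3A. A small sketch of that encoding (crawlerStartTime / crawlerEndTime are assumed to already be in this form):

import datetime
import urllib.parse

nowTime = datetime.datetime.now().strftime('%Y-%m-%d').split('-')
strNowTime = nowTime[0] + '%3A' + nowTime[1] + '%3A' + nowTime[2]

# Equivalent, using the standard library for the percent-encoding:
strNowTime2 = urllib.parse.quote(datetime.datetime.now().strftime('%Y:%m:%d'))
print(strNowTime, strNowTime2)  # e.g. 2018%3A06%3A06 2018%3A06%3A06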
Example #8
    def getPageNumFromHome(self, dowloadData):
        """
        Get the paginated page-index URLs.
        """
        if dowloadData['soup'] is None:
            return []
        else:
            # Log.i(dowloadData['content'].decode('utf-8'))
            selector = etree.HTML(dowloadData['content'].decode('utf-8'))

            try:
                page = (int(
                    selector.xpath(
                        '//div[@class="vT_z"]/div[1]/div/p[1]/span[2]/text()')
                    [0]) // 20) + 3
            except:
                return []

            if page is None:
                return []
            parentURL_infor = []
            # Use a random number to decide whether to iterate pages in reverse order
            num = random.randint(3, 7)
            # Holds the rewritten url
            tempUrl = ''
            if (num % 2) == 0:
                for i in range(1, page):
                    # TODO the string replacement here is problematic
                    #'http://search.ccgp.gov.cn/bxsearch?searchtype=1&page_index=1&bidSort=0&buyerName=&projectId=&pinMu=0&bidType=0&dbselect=bidx&kw=&start_time=2018%3A06%3A04&end_time=2018%3A06%3A11&timeType=6&displayZone=&zoneId=&pppStatus=0&agentName='
                    # x = 'page_index=' + str(i)
                    # dowloadData['Urlname'] = re.sub(r'page_index=(.)', x, dowloadData['Urlname'])
                    # TODO the URL concatenation here is problematic
                    # dowloadData['Urlname'] = 'http://search.ccgp.gov.cn/bxsearch?searchtype=1&page_index=' + str(i) \
                    #                          + '&bidSort=0&buyerName=&projectId=&pinMu=0&bidType=0&dbselect=bidx&kw=&start_time='\
                    #                          +crawlerStartTime+'&end_time='+crawlerEndTime+'&timeType=6&displayZone=&zoneId=&pppStatus=0&agentName='
                    x = 'page_index=' + str(i)
                    tempUrl = re.sub(r'page_index=(.)', x,
                                     dowloadData['Urlname'])
                    Log.i("parseUrl<<" + tempUrl)
                    urlChildInfo = URLinformation(
                        Urlname=tempUrl,
                        title=dowloadData['title'],
                        DeepNum=dowloadData['DeepNum'],
                        domain=dowloadData['domain'],
                        fatherUrl=dowloadData['fatherUrl'])
                    parentURL_infor.append(urlChildInfo)
                else:
                    if parentURL_infor is not None:
                        page = 0
                        return parentURL_infor
            else:
                for i in range(page - 1, 0, -1):
                    # TODO the string replacement here is problematic
                    #'http://search.ccgp.gov.cn/bxsearch?searchtype=1&page_index=1&bidSort=0&buyerName=&projectId=&pinMu=0&bidType=0&dbselect=bidx&kw=&start_time=2018%3A06%3A04&end_time=2018%3A06%3A11&timeType=6&displayZone=&zoneId=&pppStatus=0&agentName='
                    # x = 'page_index=' + str(i)
                    # dowloadData['Urlname'] = re.sub(r'page_index=(.)', x, dowloadData['Urlname'])
                    # dowloadData['Urlname'] = 'http://search.ccgp.gov.cn/bxsearch?searchtype=1&page_index=' + str(i) \
                    #                          + '&bidSort=0&buyerName=&projectId=&pinMu=0&bidType=0&dbselect=bidx&kw=&start_time='\
                    #                          +crawlerStartTime+'&end_time='+crawlerEndTime+'&timeType=6&displayZone=&zoneId=&pppStatus=0&agentName='

                    x = 'page_index=' + str(i)
                    tempUrl = re.sub(r'page_index=(.)', x,
                                     dowloadData['Urlname'])
                    Log.i("parseUrl<<" + tempUrl)
                    urlChildInfo = URLinformation(
                        Urlname=tempUrl,
                        title=dowloadData['title'],
                        DeepNum=dowloadData['DeepNum'],
                        domain=dowloadData['domain'],
                        fatherUrl=dowloadData['fatherUrl'])
                    parentURL_infor.append(urlChildInfo)
                else:
                    if parentURL_infor is not None:
                        page = 0
                        return parentURL_infor
Example #9
    def getChildrenLink(self, pageIndex):
        """
        Get child links.
        :return:
        """
        pattern = r'htt(p|ps):\/\/(\w+\.)+\w+/(\w+/)*'
        pattern = re.compile(pattern)
        # print("domain" + str(self.Url_inf.Urlname))
        Keyvalue = pattern.search(pageIndex['Urlname'])
        # Keyvalue  <_sre.SRE_Match object; span=(0, 26), match='http://search.ccgp.gov.cn/'>
        # print("Keyvalue  " + str(Keyvalue))
        # print(self.Url_inf.Urlname)
        if Keyvalue != None:
            Keyvalue = Keyvalue.group()
        else:
            Keyvalue = domain = urlparse(
                pageIndex['Urlname']).scheme + "://" + urlparse(
                    pageIndex['Urlname']).netloc

        domain = Keyvalue
        URL_infor = []
        URL_infor2 = []
        Links = []
        link2 = ''
        title = ''
        currentTime = ''
        total_title = ''

        # if self.Url_inf.soup == None:
        #     return []
        if USE_BXBLS is True:
            # Split into two cases
            # if self.Url_inf.Urlname.find("zygg"):
            #     ul_content = self.Url_inf.soup.select(".c_list_bid")[0]
            # elif self.Url_inf.Urlname.find("dfgg"):
            #     ul_content = self.Url_inf.soup.select(".c_list_bid")[0]
            # else:
            #     ul_content = self.Url_inf.soup
            if pageIndex['soup'] is None:
                return []
            else:
                urlInfoList = pageIndex['soup'].select(
                    ".vT-srch-result-list-bid")

            if urlInfoList is None:
                return []

            if urlInfoList:
                ul_content = urlInfoList[0]
            else:
                return []

            for li in ul_content.select("li"):
                link = li.select("a")[0]

                # emProvince = li.select("span")[2].get_text()
                spanProvince = li.select("span")[0]
                emProvince = spanProvince.select("a")[0].get_text()
                currentTime = time.time()

                try:
                    href2 = link['href']
                    total_title = link['title']
                except KeyError:
                    # 'href' or 'title' is missing: skip this link
                    pageIndex['soup'].select("a").remove(link)
                    continue
                if href2.startswith("/"):  # startswith() returns True if the string starts with the given prefix
                    # link2 = urljoin(self.Url_inf.Urlname, href2)
                    # print(str(link2))

                    # link2=self.Url_inf.Urlname+href2
                    title = link.text.replace('\n',
                                              '').replace('\t',
                                                          '').replace(' ', '')
                elif (href2.startswith("../../..")):
                    title = link.text.replace('\n',
                                              '').replace('\t',
                                                          '').replace(' ', '')
                    # link2=href2.replace('../../..',domain)
                elif href2.startswith(".."):
                    title = link.text.replace('\n',
                                              '').replace('\t',
                                                          '').replace(' ', '')
                    # link2=href2.replace('..',domain)
                elif href2.startswith("./"):
                    title = link.text.replace('\n',
                                              '').replace('\t',
                                                          '').replace(' ', '')
                    # link2=href2.replace('./',domain+'/')
                elif 'http' in href2 and 'gov' in href2:
                    title = link.text.replace('\n',
                                              '').replace('\t',
                                                          '').replace(' ', '')
                    # link2=href2

                link2 = urljoin(pageIndex['Urlname'], href2)
                # print("link2 is :" + str(link2))
                # Work around truncated titles ("...")
                if title.find("...") > -1:
                    title = total_title

                title = title.strip('\r')
                myLinkUrl = URLinformation(Urlname=link2,
                                           title=title,
                                           DeepNum=pageIndex['DeepNum'] - 1,
                                           domain=pageIndex['domain'],
                                           fatherUrl=pageIndex['Urlname'],
                                           province=emProvince,
                                           LastTime=currentTime)
                URL_infor.append(myLinkUrl)

        else:
            for link in pageIndex['soup'].select("a"):
                # print(str(self.Url_inf.soup))
                # <a href="http://www.ccgp.gov.cn/cggg/dfgg/gkzb/201310/t20131008_3148218.htm" style="line-height:18px" target="_blank">
                #                                         南方科技大学等离子体技术基础仪器采购项目招标公告
                #                                     </a>
                # <a href="http://www.ccgp.gov.cn/cggg/dfgg/gkzb/201309/t20130926_3144053.htm" style="line-height:18px" target="_blank">
                #                                         2013年国家良种补贴牦牛、绵羊、奶牛冻精、肉牛冻精采购项目公开招标公告
                # print("children url is : "+ str(link))
                try:
                    href2 = link['href']  # extract the link target from the href attribute (samples above)
                    # print("href2:   " + str(href2))
                    # The extracted value can take one of the following three forms:
                    # http://www.ccgp.gov.cn/cggg/dfgg/gkzb/201309/t20130926_3144362.htm
                    # javascript:void(0)
                    # #
                except KeyError:
                    pageIndex['soup'].select("a").remove(link)

                else:  # runs only when the try block succeeded
                    if href2.startswith("/"):  # startswith() returns True if the string starts with the given prefix
                        # link2 = urljoin(self.Url_inf.Urlname, href2)
                        # print(str(link2))

                        # link2=self.Url_inf.Urlname+href2
                        title = link.text.replace('\n', '').replace(
                            '\t', '').replace(' ', '')
                    elif (href2.startswith("../../..")):
                        title = link.text.replace('\n', '').replace(
                            '\t', '').replace(' ', '')
                        # link2=href2.replace('../../..',domain)
                    elif href2.startswith(".."):
                        title = link.text.replace('\n', '').replace(
                            '\t', '').replace(' ', '')
                        # link2=href2.replace('..',domain)
                    elif href2.startswith("./"):
                        title = link.text.replace('\n', '').replace(
                            '\t', '').replace(' ', '')
                        # link2=href2.replace('./',domain+'/')
                    elif 'http' in href2 and 'gov' in href2:
                        title = link.text.replace('\n', '').replace(
                            '\t', '').replace(' ', '')
                        # link2=href2

                    link2 = urljoin(pageIndex['Urlname'], href2)
                    # print("link2 is :" + str(link2))
                    myLinkUrl = URLinformation(Urlname=link2,
                                               title=title,
                                               DeepNum=pageIndex['DeepNum'] -
                                               1,
                                               domain=pageIndex['domain'],
                                               fatherUrl=pageIndex['Urlname'])
                    URL_infor.append(myLinkUrl)

        if USE_BXBLS is True:
            Links = list(set(URL_infor))
        else:
            # TODO AttributeError: 'NoneType' object has no attribute 'select' can occur here
            for http in pageIndex['soup'].select('option'):  # contents currently unknown
                try:
                    http2 = http['value']
                    # print("option" + str(http))
                except KeyError:
                    pageIndex['soup'].select("option").remove(http)
                else:
                    if "gov" in http2 and 'http' in http2:
                        myLinkUrl2 = URLinformation(
                            Urlname=http2,
                            title=http.text,
                            DeepNum=pageIndex['DeepNum'] - 1,
                            domain=pageIndex['domain'],
                            fatherUrl=pageIndex['Urlname'])
                        URL_infor2.append(myLinkUrl2)

            Links = list(set(URL_infor + URL_infor2))

        #TODO [2018-05-15 18:13:47.492] [INFO] [31469] [getChildrenLink(),ParseCCGPModule.py:129]: This url have 56  children urls1
        Log.i("This url have " + str(len(Links)) + "  children urls" +
              str(pageIndex['DeepNum']))
        return Links
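All of the href forms handled above end up going through urljoin(pageIndex['Urlname'], href2), which resolves them against the listing page URL. A short sketch of how the standard-library urljoin treats each form (the base and targets below are hypothetical):

from urllib.parse import urljoin

base = 'http://www.ccgp.gov.cn/cggg/dfgg/index_2.htm'
print(urljoin(base, '/cggg/zygg/t001.htm'))            # http://www.ccgp.gov.cn/cggg/zygg/t001.htm
print(urljoin(base, '../../cggg/zygg/t001.htm'))       # http://www.ccgp.gov.cn/cggg/zygg/t001.htm
print(urljoin(base, './t001.htm'))                     # http://www.ccgp.gov.cn/cggg/dfgg/t001.htm
print(urljoin(base, 'http://www.ccgp.gov.cn/x.htm'))   # absolute URLs pass through unchanged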
Example #10
    def __init__(self):
        self.producer = self.__setproducer()
        self.consumer = self.__setconsumer()
        self.URL_inf = URLinformation()
Example #11
class localKafkaUrlinformation():
    def __init__(self):
        self.producer = self.__setproducer()
        self.consumer = self.__setconsumer()
        self.URL_inf = URLinformation()

    # def __del__(self):
    #     self.producer.close()
    #     self.consumer.close()

    def __setproducer(self):
        """
        Return a producer for the parent-link topic.
        :return:
        """
        conf = localKafka_setting
        producer = KafkaProducer(bootstrap_servers=conf['bootstrap_servers'])
        return producer

    def __setconsumer(self):
        """
        Return a consumer for the parent-link topic.
        :return:
        """
        conf = localKafka_setting
        try:
            consumer = KafkaConsumer(
                bootstrap_servers=conf['bootstrap_servers'],
                group_id=conf['consumer_id'])
        except KafkaError as e:
            Log.e(str(e) + ' kafkaConsumer failed')

        return consumer

    # @AsycThread.async
    def producerUrl(self, strurl):
        """
        Produce a parent link.
        :param strurl:
        """
        try:
            conf = localKafka_setting
            future = self.producer.send(conf['topic_name'],
                                        bytes(strurl, 'ASCII'))
            self.producer.flush()
            future.get()
        except KafkaError as e:
            # TODO handle kafka.errors.KafkaTimeoutError: Failed to update metadata after 60.0 secs
            # https://stackoverflow.com/questions/48261501/kafka-errors-kafkatimeouterror-kafkatimeouterror-failed-to-update-metadata-aft
            self.producer.close()
            if self.producer is None:
                self.producer = self.__setproducer()
            Log.e(str(e) + ' send message failed')
            pass

    def consumerurl(self):
        """
        Consume parent links.
        """
        conf = localKafka_setting
        self.consumer.subscribe((conf['topic_name']))
        # TODO blocking here means the consumer connection timed out; the underlying SDK reconnects automatically and this loops forever listening for messages
        for message in self.consumer:
            jsondata = str(message.value, "utf-8")
            Log.i(jsondata)
            # try:
            #     dictdata = json.loads(jsondata)
            # except Exception as e:
            #     Log.e(e + jsondata)
            #     continue

    # @AsycThread.async
    def producterUUID(self, strurl):
        """
        Produce a uuid message on the ccgp topic.
        :param strurl:
        """
        try:
            conf = localKafka_setting
            # TODO raises kafka.errors.KafkaTimeoutError: Failed to update metadata after 60.0 secs.
            future = self.producer.send(conf['topic_name_ccgp'],
                                        bytes(strurl, 'ASCII'))
            self.producer.flush()
            future.get()
        except KafkaError as e:
            self.producer.close()
            if self.producer is None:
                self.producer = self.__setproducer()
            Log.e(str(e) + ' send message failed')
            pass

    def setURL_inf(self, dictdata):
        """
        URL data model.
        :param dictdata:
        """
        self.URL_inf.dict2class(dictdata)

    def getURL_inf(self):
        """
        External accessor for the URL object.
        :return:
        """
        return self.URL_inf
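A minimal usage sketch of this class, assuming localKafka_setting provides bootstrap_servers, consumer_id, topic_name and topic_name_ccgp and a broker is reachable:

import json

kafka_op = localKafkaUrlinformation()

# Produce a parent link onto the configured topic.
kafka_op.producerUrl(json.dumps({"Urlname": "http://www.ccgp.gov.cn/cggg/zygg/index.htm"}))

# Block forever, logging every message received on the same topic.
kafka_op.consumerurl()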
Example #12
class Downloader(multiprocessing.Process):
    def __init__(self, processName, pipeDictData):
        multiprocessing.Process.__init__(self)
        self.processName = processName
        self.pipeDictData = pipeDictData  # task URL message queue

        # Database model and controller
        self.__Sendcollection = "Send_collection"
        self.URL_inf = URLinformation()
        # self.mogodbControl = Mongodb_Operator(Dbdata["host"], Dbdata["port"], Dbdata["db_name"], Dbdata["default_collection"])
        # Mongo is still used from a single process here, otherwise it keeps throwing warnings
        self.mogodbControl = None

    def ayncDownloadTask(self, ipProxy, DictData):
        """
        Run the crawler task asynchronously.
        :param ipProxy:
        :param DictData:
        :return:
        """
        # Log.i(DictData)
        global mutex_lock
        mutex_lock.acquire()  # start of the critical section (mutual exclusion)

        if self.URL_inf is None:
            self.URL_inf = URLinformation()

        # Convert the source data into the entity class
        self.URL_inf.dict2class(DictData)
        # # Deduplication logic
        # if self.__checkURL(self.URL_inf.Urlname):
        #     return
        # else:
        #     item = {"_id": self.__getMD5(self.URL_inf.Urlname)}
        #     self.mogodbControl.insert(item, self.__Sendcollection)

        # TODO keep Wang Yang's earlier code logic here
        self.URL_inf = self.__getSoupAndDeepnumOrDown(ipProxy)
        if self.URL_inf is None or self.URL_inf.Urlname is None or self.URL_inf.content is None:
            mutex_lock.release()  # end of the critical section
            return

        # Deduplication logic, uid = urlName + content
        # hashlib.md5((self.Url_inf.Urlname + self.Url_inf.content.decode("utf8", "ignore")).encode(
        #     "utf-8")).hexdigest()  # MD5 hash
        # Raised AttributeError: 'NoneType' object has no attribute 'decode'
        # checkUrlUID = self.URL_inf.Urlname+self.URL_inf.title.decode("utf8","ignore")
        # checkUrlUID = urllib.parse.unquote(self.URL_inf.Urlname)
        # checkUrlUID = checkUrlUID.join(str(self.URL_inf.content))

        # checkUrlUID = hashlib.md5((urllib.parse.unquote(self.URL_inf.Urlname).join(str(self.URL_inf.content)).encode(
        #     "utf-8"))).hexdigest()  # MD5 hash
        #
        # if self.__checkURL(checkUrlUID):
        #     mutex_lock.release()  # end of the critical section
        #     return
        # else:
        #     item = {"_id": self.__getMD5(checkUrlUID)}
        #     self.mogodbControl.insert(item, self.__Sendcollection)

        # Send child links
        self._sendChildUrl(self.URL_inf, mutex_lock)

        mutex_lock.release()  # end of the critical section

    @AsycThread.async
    def _sendChildUrl(self,URL_inf, mutex_lock):
        # Save the data, extract child links and re-produce them to the corresponding topic
        KafkaOperator = kafkaUrlinformation()
        # TODO manage per-site logic with dedicated classes here
        parseCCGPModule = ParserCCGPModule(URL_inf, KafkaOperator)
        ccgpChildrenLink = parseCCGPModule.getLinks()

        if ccgpChildrenLink is None:
            mutex_lock.release()  # end of the critical section
            return

        for link in ccgpChildrenLink:
            # Yu Hao asked not to be sent parent links
            if link.DeepNum >= 0:
                Log.i("produce<<"+json.dumps(link.class2dict()))
                KafkaOperator.producerUrl(json.dumps(link.class2dict()))

    def __downlowdFile(self, url, req):
        # http://stackoverflow.com/questions/862173/how-to-download-a-file-using-python-in-a-smarter-way
        """
        File download logic. Follows Wang Yang's approach; not yet debugged.
        :param url:
        :param req:
        """
        reqheaders = req.headers
        revealfn = url.split('/')[-1]

        if "." in revealfn[-6:]:
            fileName = revealfn
        else:
            if ('Content-Disposition' in reqheaders.keys()):
                fileName = reqheaders['Content-Disposition'].split('filename=')[1]
                fileName = fileName.replace('"', '').replace("'", "")
            else:
                r = urllib.request.urlopen(url)
                if r.url != url:
                    fileName = basename(urlsplit(r.url)[2])
            self.URL_inf.FileName = fileName

        _FileName = None
        if (self.URL_inf.FilePath):
            _FileName = str(self.URL_inf.FilePath) + fileName
        else:
            _FileName = fileName

        with open(_FileName, "wb") as donefile:
            for chunk in req.iter_content(chunk_size=1024):
                if chunk:
                    donefile.write(chunk)

        Log.i("File:"+_FileName+"downLoaded")

    def __getSoupAndDeepnumOrDown(self, ipProxy, headers=None):
        """
        Crawl the page and do a simple parse for child links.
        :param ipProxy:
        :param headers:
        """

        html = None  # full page content
        ctifety = 0  # flag: child links can be parsed
        Flag = 1  # flag: crawl finished
        count = 0  # empty-page counter

        # Initialize the request headers
        headers = HEADERS
        headersEngine = HeadersEngine()

        # Big loop that downloads page content and files; the I/O here is slow
        while (Flag):
            url = self.URL_inf.Urlname  # temporary variable
            try:
                # Switch to a new IP after an exception
                if count > 0:
                    ipProxy=self.getIpPoolMethod()
                protocol = 'https' if 'https' in ipProxy else 'http'
                proxiesmmm = {protocol: ipProxy}
                # HTTP request for the page (even though I don't like this library)
                # req = requests.get(url, headers=headers, proxies=proxiesmmm, timeout=2)  # ,proxies=proxiesmmm,stream=True
                # req = requests.get(url, headers=headers, proxies=proxiesmmm)
                # Work around HTTP timeouts (https://www.zhihu.com/question/52595659); refuse the default 301/302 redirects
                req = requests.get(url, headers=headers, allow_redirects=False, proxies=proxiesmmm, timeout=3)

                if req.status_code != 200:
                    return None

                reqheaders = req.headers
                if "application" in reqheaders["Content-Type"]:
                    self.__downlowdFile(url=url, req=req)
                    self.URL_inf.Download = 1
                elif "text" in reqheaders["Content-Type"]:
                    html = req.content
                    self.URL_inf.Download = 0
                    ctifety = 1
                    Flag = 0  # done, leave the loop
                else:
                    return None
            except requests.exceptions.ConnectTimeout as e:
                Log.e("getSoupAndDeepnumOrDown HeadError -> " + str(e))
                if count > 3:
                    return None
                pass
            except (ConnectionError, Timeout) as e:
                Flag = 1
                count+=1
                Log.e("getSoupAndDeepnumOrDown HeadError -> " + str(e))
                headers = headersEngine.getHeaders()
                # Close spare connections; requests raised "Max retries exceeded with url"
                s = requests.session()
                s.keep_alive = False
                if count > 3:
                    return None
                pass
            except Exception as e:
                Flag = 1
                count += 1
                # TODO handle javascript:void(0) links; ignore this kind of exception: https://www.zhihu.com/question/20626694?from=profile_question_card
                # TODO handle the invalid-header exception: Invalid return character or leading space in header: Accept-Language
                # TODO handle HTTPConnectionPool max retries: Failed to establish a new connection
                Log.e("getSoupAndDeepnumOrDown Exception -> " + str(e))
                headers = headersEngine.getHeaders()
                # Handle "Max retries exceeded with url"
                s = requests.session()
                s.keep_alive = False
                if count > 3:
                    return None
                pass


        if ctifety:
            self.URL_inf.content = html
            soup = BeautifulSoup(html, 'html.parser')  # simple parse with BeautifulSoup
        else:
            soup = None

        self.URL_inf.soup = soup
        # Log.i(self.URL_inf.content.decode('utf-8'))
        return self.URL_inf  # finally done crawling and parsing

    def __getMD5(self, url):
        """
        If content is None, only the url is encoded.
        :return:
        """
        return hashlib.md5(url.encode("utf-8")).hexdigest()  # MD5 hash

    def __checkURL(self, urlName):
        """
        Deduplication check.
        :param urlName:
        :return:
        """
        item = {"_id": urlName}
        value = self.mogodbControl.findone(item, self.__Sendcollection)  # returns the document if found
        if value is None:  # not found
            return False
        else:
            return True  # found

    def run_downloader(self, pipeDictData):
        """
        Download driver.
        :param pipeDictData: data source (pipe)
        """
        # Changed so that a single IP downloads the same site
        ipProxy = None
        while True:
            # Log.i('run_downloader in {0}'.format(time.ctime()))
            # Get the source data
            DictData = pipeDictData.recv()

            # Create the database connection only once data arrives
            if self.mogodbControl is None:
                self.mogodbControl = Mongodb_Operator(Dbdata["host"], Dbdata["port"], Dbdata["db_name"], Dbdata["default_collection"])
            # Fetch a proxy IP only once data arrives
            if DictData is not None:
                # Get a free proxy IP
                if ipProxy is None:
                    ipProxy = self.getIpPoolMethod()
                # Run the download asynchronously
                self.ayncDownloadTask(ipProxy, DictData)
            # else:
            #     # Sleep n seconds (read from the config file)
            #     items = ConfigUtil.getItems('consumerScheduler')
            #     interval_min = items['interval_min']
            #     interval_max = items['interval_max']
            #     seconds = random.randint(int(interval_min), int(interval_max))
            #     Log.i('run_downloader sleep ' + str(seconds) + ' seconds')
            #     time.sleep(seconds)
            #     continue

    def getIpPoolMethod(self):
        """
        Get a free proxy IP.
        :return: a free IP proxy
        """
        ipProxy = None
        if ipProxy is None:
            if USE_PROXY is True:
                # Make sure there is at least one IP when fetching a proxy
                proxyIpPool = getIpProxyPool()
                if proxyIpPool is not None:
                    ipProxy = proxyIpPool

                if ipProxy is None:
                    ipProxy = PROXY_NONE_URL
            else:
                ipProxy = PROXY_NONE_URL

        return ipProxy
        # ipProxy = None
        # if ipProxy is None:
        #     if USE_PROXY is True:
        #         # Make sure there is at least one IP when fetching a proxy
        #         proxyIpPool = getIpProxyPool()
        #         proxyIpPoolFromeRemote = getIpProxyPoolFromeRemote()
        #
        #         if proxyIpPool is None:
        #             ipProxy = proxyIpPoolFromeRemote
        #         else:
        #             ipProxy = proxyIpPool
        #
        #         if ipProxy is None:
        #             ipProxy = PROXY_NONE_URL
        #     else:
        #         ipProxy = PROXY_NONE_URL
        #
        # return ipProxy


    def run(self):
        '''
        Runs as the free-IP-proxy crawler process; loops reading tasks.
        :return:
        '''
        Log.i('Downloader.run() in {0}'.format(time.ctime()))

        p_list = list()

        downloaderRun = Process(target=self.run_downloader, args=(self.pipeDictData,))
        p_list.append(downloaderRun)

        for p in p_list:
            p.daemon = True
            p.start()
        for p in p_list:
            p.join()
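A sketch of how this Downloader appears to be wired up, under the assumption that the Kafka consumer from Example #13 (kafkaUrlinformation.consumerurl) feeds one end of a multiprocessing.Pipe while the Downloader reads dict data from the other end:

import multiprocessing

if __name__ == '__main__':
    parent_conn, child_conn = multiprocessing.Pipe()

    downloader = Downloader('downloader-1', child_conn)
    downloader.start()  # run() spawns run_downloader(), which blocks on pipeDictData.recv()

    # consumerurl() loops forever and calls pipeDictData.send(dictdata) for every
    # message it manages to decode.
    kafkaUrlinformation().consumerurl(parent_conn)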
Example #13
class kafkaUrlinformation():
    def __init__(self):
        self.producer = self.__setproducer()
        self.consumer = self.__setconsumer()
        self.URL_inf = URLinformation()

    # def __del__(self):
    #     self.producer.close()
    #     self.consumer.close()

    def __setproducer(self):
        """
        Return a producer for the parent-link topic.
        :return:
        """
        conf = kafka_setting
        context = ssl.create_default_context()
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.verify_mode = ssl.CERT_REQUIRED
        # context.check_hostname = True
        context.load_verify_locations(CACERT_FILENAME)

        producer = KafkaProducer(bootstrap_servers=conf['bootstrap_servers'],
                                 sasl_mechanism="PLAIN",
                                 ssl_context=context,
                                 security_protocol='SASL_SSL',
                                 api_version=(0, 10),
                                 retries=5,
                                 sasl_plain_username=conf['sasl_plain_username'],
                                 sasl_plain_password=conf['sasl_plain_password'])
        return producer

    def __setconsumer(self):
        """
        Return a consumer for the parent-link topic.
        :return:
        """
        conf = kafka_setting
        context = ssl.create_default_context()
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.verify_mode = ssl.CERT_REQUIRED
        # context.check_hostname = True
        context.load_verify_locations(CACERT_FILENAME)

        try:
            consumer = KafkaConsumer(bootstrap_servers=conf['bootstrap_servers'],
                                     group_id=conf['consumer_id'],
                                     sasl_mechanism="PLAIN",
                                     ssl_context=context,
                                     security_protocol='SASL_SSL',
                                     api_version=(0, 10),
                                     sasl_plain_username=conf['sasl_plain_username'],
                                     sasl_plain_password=conf['sasl_plain_password'])
        except KafkaError as e:
            Log.e(str(e) + ' kafkaConsumer failed')

        return consumer

    # @AsycThread.async
    def producerUrl(self, strurl):
        """
        Produce a parent link.
        :param strurl:
        """
        try:
            conf = kafka_setting
            future = self.producer.send(conf['topic_name'], bytes(strurl, 'ASCII'))
            self.producer.flush()
            future.get()
        except KafkaError as e:
            # TODO handle kafka.errors.KafkaTimeoutError: Failed to update metadata after 60.0 secs
            #https://stackoverflow.com/questions/48261501/kafka-errors-kafkatimeouterror-kafkatimeouterror-failed-to-update-metadata-aft
            self.producer.close()
            if self.producer is None:
                self.producer = self.__setproducer()
            Log.e(str(e) + ' send message failed')
            pass

    def consumerurl(self, pipeDictData):
        """
        Consume parent links.
        :param pipeDictData:
        """
        conf = kafka_setting
        self.consumer.subscribe((conf['topic_name']))
        # TODO blocking here means the consumer connection timed out; the underlying SDK reconnects automatically and this loops forever listening for messages
        for message in self.consumer:
            jsondata = str(message.value, "utf-8")
            # Log.i(jsondata)
            try:
                dictdata = json.loads(jsondata)
            except Exception as e:
                Log.e(str(e) + jsondata)
                continue
            # self.setURL_inf(dictdata)
            # Send the source data to drive the downloader
            pipeDictData.send(dictdata)
            # queueDictData.put(dictdata)

    @AsycThread.async
    def producterUUID(self, strurl):
        """
        Produce a uuid message on the ccgp topic.
        :param strurl:
        """
        try:
            conf = kafka_setting
            # TODO raises kafka.errors.KafkaTimeoutError: Failed to update metadata after 60.0 secs.
            future = self.producer.send(conf['topic_name_ccgp'], bytes(strurl, 'ASCII'))
            self.producer.flush()
            future.get()
        except KafkaError as e:
            self.producer.close()
            if self.producer is None:
                self.producer = self.__setproducer()
            Log.e(str(e) + ' send message failed')
            pass


    def setURL_inf(self, dictdata):
        """
        URL data model.
        :param dictdata:
        """
        self.URL_inf.dict2class(dictdata)

    def getURL_inf(self):
        """
        External accessor for the URL object.
        :return:
        """
        return self.URL_inf

class ComsumerChildUrl(multiprocessing.Process):
    def __init__(self, processName, pipeDictData):
        multiprocessing.Process.__init__(self)
        self.processName = processName
        # Initialize ipProxy and headers
        self.ipProxy = self.getIpPoolMethod()
        self.headersEngine = HeadersEngine()
        self.heads = self.headersEngine.getHeaders()
        # Database model and controller
        self.URL_inf = URLinformation()
        self.__Sendcollection = "httpsearchccgpgovcn"
        self.mogodbControl = None
        self.KafkaOperator = None
        self.pipeDictData = pipeDictData  # task URL message queue

    def downLoadHtml(self):
        """
        Crawl the page and extract child links.
        """
        if self.ipProxy is None:
            self.ipProxy = self.getIpPoolMethod()
        if self.heads is None:
            self.heads = self.headersEngine.getHeaders()

        # {'DeepNum': 1, 'fatherUrl': None, 'Download': False, 'province': None, 'domain': 'http://search.ccgp.gov.cn',
        #  'FileName': None, 'Keyword': None, 'title': None, 'LastTime': 0.0, 'Flag': 0, 'soup': None, 'State': 0,
        #  'content': None,
        #  'Urlname': 'http://search.ccgp.gov.cn/bxsearch?searchtype=1&page_index=1&bidSort=0&buyerName=&projectId=&pinMu=0&bidType=1&dbselect=bidx&kw=&start_time=2018%3A06%3A07&end_time=2018%3A06%3A07&timeType=6&displayZone=&zoneId=&pppStatus=0&agentName=',
        #  'SleepTime': 0.0, 'FilePath': None}

        html = None  # full page content
        ctifety = 0  # flag: child links can be parsed
        Flag = 1  # flag: crawl finished
        count = 0  # empty-page counter
        while (Flag):
            try:
                if count > 1:
                    self.ipProxy = self.getIpPoolMethod()

                protocol = 'https' if 'https' in self.ipProxy else 'http'
                proxiesmmm = {protocol: self.ipProxy}

                req = requests.get(self.URL_inf.Urlname,
                                   headers=self.heads,
                                   allow_redirects=False,
                                   proxies=proxiesmmm,
                                   timeout=3)
                # Bypass the anti-crawling verification page
                soup_validate = BeautifulSoup(req.text, 'lxml')
                if soup_validate.find(name='title').string == '安全验证':
                    self.ipProxy = self.getIpPoolMethod()
                    continue
                if req.status_code != 200:
                    self.ipProxy = self.getIpPoolMethod()
                    continue

                reqheaders = req.headers
                if "application" in reqheaders["Content-Type"]:
                    data = self.__downlowdFile(data=self.URL_inf, req=req)
                    data['Download'] = 1
                elif "text" in reqheaders["Content-Type"]:
                    html = req.content
                    self.URL_inf.Download = 0
                    ctifety = 1
                    Flag = 0  # done, leave the loop
                else:
                    continue
            except requests.exceptions.ConnectTimeout as e:
                Log.e("getSoupAndDeepnumOrDown HeadError -> " + str(e))
                self.heads = self.headersEngine.getHeaders()
                count += 1
                if html is None:
                    Flag = 1
            except (ConnectionError, Timeout) as e:
                Flag = 1
                count += 1
                Log.e("getSoupAndDeepnumOrDown HeadError -> " + str(e))
                self.heads = self.headersEngine.getHeaders()
                # Close spare connections; requests raised "Max retries exceeded with url"
                requests.adapters.DEFAULT_RETRIES = 5
                s = requests.session()
                s.keep_alive = False
                count += 1
                if html is None:
                    Flag = 1
                pass
            except Exception as e:
                Flag = 1
                count += 1
                # TODO handle javascript:void(0) links; ignore this kind of exception: https://www.zhihu.com/question/20626694?from=profile_question_card
                # TODO handle the invalid-header exception: Invalid return character or leading space in header: Accept-Language
                # TODO handle HTTPConnectionPool max retries: Failed to establish a new connection
                Log.e("getSoupAndDeepnumOrDown Exception -> " + str(e))
                self.heads = self.headersEngine.getHeaders()
                # Handle "Max retries exceeded with url"
                s = requests.session()
                s.keep_alive = False
                count += 1
                if html is None:
                    Flag = 1
                pass

        if ctifety:
            self.URL_inf.content = html
            soup = BeautifulSoup(html, 'html.parser')  # simple parse with BeautifulSoup
        else:
            soup = None

        self.URL_inf.soup = soup
        Log.i(self.URL_inf.content.decode('utf-8'))
        return self.URL_inf

    def __downlowdFile(self, data, req):
        # http://stackoverflow.com/questions/862173/how-to-download-a-file-using-python-in-a-smarter-way
        """
        File download logic. Follows Wang Yang's approach; not yet debugged.
        :param data:
        :param req:
        """
        reqheaders = req.headers
        revealfn = data['Urlname'].split('/')[-1]

        if "." in revealfn[-6:]:
            fileName = revealfn
        else:
            if ('Content-Disposition' in reqheaders.keys()):
                fileName = reqheaders['Content-Disposition'].split(
                    'filename=')[1]
                fileName = fileName.replace('"', '').replace("'", "")
            else:
                r = urllib.request.urlopen(data['Urlname'])
                if r.url != data['Urlname']:
                    fileName = basename(urlsplit(r.url)[2])
            data['FileName'] = fileName

        _FileName = None
        if (data['FilePath']):
            _FileName = str(data['FilePath']) + fileName
        else:
            _FileName = fileName

        with open(_FileName, "wb") as donefile:
            for chunk in req.iter_content(chunk_size=1024):
                if chunk:
                    donefile.write(chunk)

        Log.i("File:" + _FileName + "downLoaded")
        return data

    def get_md5(self, url, content):
        """
        Generate an MD5 hash of the url and content; if content is None, only the url is encoded.
        :return:
        """
        return hashlib.md5((urllib.parse.unquote(url).join(
            str(content)).encode("utf-8"))).hexdigest()  # MD5 hash

    def __checkURL(self, urlName):
        """
        Deduplication check.
        :param urlName:
        :return:
        """
        item = {"_id": urlName}
        value = self.mogodbControl.findone(item,
                                           self.__Sendcollection)  # returns the document if found
        if value is None:  # not found
            return False
        else:
            return True  # found

    def getIpPoolMethod(self):
        """
        Get a free proxy IP.
        :return: a free IP proxy
        """
        ipProxy = None
        if ipProxy is None:
            if USE_PROXY is True:
                # Make sure there is at least one IP when fetching a proxy
                proxyIpPool = getIpProxyPool()
                if proxyIpPool is not None:
                    ipProxy = proxyIpPool

                if ipProxy is None:
                    ipProxy = PROXY_NONE_URL
            else:
                ipProxy = PROXY_NONE_URL

        return ipProxy

    # @AsycThread.async
    def savedata(self, data):
        """
        Save the data to MongoDB.
        """
        # Deduplicate
        uuid = self.get_md5(data.Urlname, data.title)
        urlInfo = {
            "uuid": uuid,
            "url": data.Urlname,
            "title": data.title,
            "time": datetime.now().timestamp(),
            "content": data.content,
            "fatherUrl": data.fatherUrl,
            "province": data.province,
            "LastTime": data.LastTime
        }
        string = data.domain.replace('.', '').replace('/', '').replace(':', '')
        # Dedup / delete / replace
        if data.province is not None and data.content is not None:
            item = {"uuid": uuid}
            value = self.mogodbControl.findone(
                item, self.__Sendcollection)  # returns the document if found
            # TODO the database insert has issues
            if value is None:
                # item = {"_id": uuid}
                self.mogodbControl.insert(urlInfo, self.__Sendcollection)
                # self.mogodbControl.ensure_index(item, self.__Sendcollection)
                self.KafkaOperator.producterUUID(
                    json.dumps({
                        "uuid": uuid,
                        'collection': string
                    }))

    def run(self):
        '''
        Producer process entry point; loops reading tasks every 60*60*60*24 seconds.
        :return:
        '''
        Log.i('ProducerUrl.run() in {0}'.format(time.ctime()))
        while True:
            # Listen for data
            DictData = self.pipeDictData.recv()
            if DictData is None:
                continue
            # Convert the source data into the entity class
            self.URL_inf.dict2class(DictData)
            # Check the Mongo connection
            if self.mogodbControl is None:
                self.mogodbControl = Mongodb_Operator(
                    DbdataCCGPDFZB["host"], DbdataCCGPDFZB["port"],
                    DbdataCCGPDFZB["db_name"],
                    DbdataCCGPDFZB["default_collection"])
            # Check the Kafka connection
            if self.KafkaOperator is None:
                self.KafkaOperator = localKafkaUrlinformation()
            # Deduplicate
            uuid = self.get_md5(self.URL_inf.Urlname, self.URL_inf.title)
            item = {"uuid": uuid}
            value = self.mogodbControl.findone(
                item, self.__Sendcollection)  # returns the document if found
            # # TODO the database insert has issues
            if value is not None:
                continue
            # Fetch the page content
            self.URL_inf = self.downLoadHtml()
            if self.URL_inf is None:
                continue
            # Save the data asynchronously
            self.savedata(self.URL_inf)

            # # Sleep n seconds (read from the config file)
            items = ConfigUtil.getItems('consumerScheduler')
            interval_min = items['interval_min']
            interval_max = items['interval_max']
            seconds = random.randint(int(interval_min), int(interval_max))
            Log.i('StartProducerUrl sleep ' + str(seconds) + ' seconds')
            time.sleep(seconds)
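For completeness, a sketch of driving ComsumerChildUrl the same way, with a multiprocessing.Pipe carrying URL dicts into the process; the test dict below is hypothetical and only uses a few of the keys from the sample dict in the comment inside downLoadHtml():

import multiprocessing

if __name__ == '__main__':
    parent_conn, child_conn = multiprocessing.Pipe()

    consumer_proc = ComsumerChildUrl('child-url-consumer', child_conn)
    consumer_proc.start()

    # Anything that produces URL dicts can feed the other end of the pipe,
    # e.g. the localKafkaUrlinformation consumer, or a hand-built dict for testing:
    parent_conn.send({'Urlname': 'http://www.ccgp.gov.cn/cggg/zygg/index.htm',
                      'DeepNum': 1, 'title': None, 'domain': 'http://www.ccgp.gov.cn'})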