Example #1
    def readSourceListByParams(begin, end):
        '''
        Build the list of seed URLs from the source URL config file
        :return: list of URLinformation objects
        '''
        if USE_BXBLS is True:
            MyUrl_SourceList = []
            ftp = open(SOURCEURL_FILENAME, 'r')
            # http://www.ccgp.gov.cn/cggg/zygg/index,0,0,0,0,24
            # http://www.ccgp.gov.cn/cggg/dfgg/index,0,0,0,0,24
            for line in ftp.readlines():
                myUrllist = line.split(',')
                for i in range(int(begin), int(end) + 1):  # one URL per page index, collected into the list
                    if i == 0:
                        url = myUrllist[0] + ".htm"
                    else:
                        url = myUrllist[0] + "_" + str(i) + ".htm"

                    URL_inf = URLinformation(url, int(myUrllist[1]), 0.0,
                                             float(myUrllist[2]))  # format
                    URL_inf.Flag = 0
                    URL_inf.DeepNum = 1
                    URL_inf.domain = UrlUtil.getdomain(url)
                    MyUrl_SourceList.append(URL_inf)

        else:
            MyUrl_SourceList = []
            ftp = open(SOURCEURL_FILENAME, 'r')
            # http://search.ccgp.gov.cn/bxsearch?searchtype=1&page_index=,&bidSort=0&buyerName=&projectId=&pinMu=0&bidType=1&dbselect=bidx&kw=&start_time=2013%3A04%3A09&end_time=2014%3A04%3A08&timeType=6&displayZone=&zoneId=&pppStatus=0&agentName=,0,0,0,1,9068
            for line in ftp.readlines():
                myUrllist = line.split(',')
                for i in range(int(myUrllist[5]),
                               int(myUrllist[6])):  # one URL per page index, collected into the list
                    url = myUrllist[0] + str(i) + myUrllist[1]
                    URL_inf = URLinformation(url, int(myUrllist[2]), 0.0,
                                             float(myUrllist[4]))  # format
                    URL_inf.Flag = 0
                    URL_inf.DeepNum = 1
                    URL_inf.domain = UrlUtil.getdomain(url)
                    MyUrl_SourceList.append(URL_inf)
        ftp.close()
        return MyUrl_SourceList
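
All three examples build URLinformation objects and then set Flag, DeepNum, and domain on them. A minimal stand-in class that would let the snippets run, inferred only from those constructor calls and attribute assignments, might look like the sketch below; the positional parameter names are guesses, not the project's actual definitions:

    class URLinformation:
        # Minimal sketch inferred from usage above; real field names may differ.
        def __init__(self, Urlname, depth=0, weight=0.0, score=0.0):
            self.Urlname = Urlname
            self.depth = depth      # second positional arg, e.g. int(myUrllist[1])
            self.weight = weight    # always passed as 0.0 above
            self.score = score      # e.g. float(myUrllist[2])
            self.Flag = 0           # set explicitly by the callers
            self.DeepNum = 0        # crawl depth, set explicitly by the callers
            self.domain = None      # filled in via UrlUtil.getdomain(url)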
Example #2
    def readTaskList():
        '''
        Read seed URLs from the task file, one URL per line
        :return: list of URLinformation objects
        '''
        MyUrl_SourceList = []
        ftp = open(TASK_FILENAME, 'r')
        for line in ftp.readlines():
            line = line.strip("\n")
            if not UrlUtil.isLegalUrl(line):
                break  # stop reading at the first malformed URL
            URL_inf = URLinformation(line, 0, 0.0, 0)  # format
            URL_inf.Flag = 0
            URL_inf.DeepNum = 1
            URL_inf.domain = UrlUtil.getdomain(line)
            MyUrl_SourceList.append(URL_inf)

        ftp.close()
        return MyUrl_SourceList
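
The examples also lean on UrlUtil.getdomain() and UrlUtil.isLegalUrl(). A minimal sketch of such helpers, assuming they do no more than parse the URL with the standard library; the real project's implementation may well be stricter:

    from urllib.parse import urlparse

    class UrlUtil:
        # Hypothetical helpers matching the calls above; the real ones may differ.
        @staticmethod
        def getdomain(url):
            # 'http://www.ccgp.gov.cn/cggg/zygg/index.htm' -> 'www.ccgp.gov.cn'
            return urlparse(url).netloc

        @staticmethod
        def isLegalUrl(url):
            parts = urlparse(url)
            return parts.scheme in ('http', 'https') and bool(parts.netloc)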
Example #3
    def readSourceListRealTime():
        """
        构造实时爬取父URL
        bidType字段:招标类型
        page_index字段:页码
        start_time=2018%3A06%3A06字段:开始时间,2018年06月06日
        end_time=2018%3A06%3A06字段:开始时间,2018年06月06日
        """
        #http://search.ccgp.gov.cn/bxsearch?searchtype=1&page_index=1&bidSort=0&buyerName=&projectId=&pinMu=0&bidType=1&dbselect=bidx&kw=&start_time=,
        # &end_time=,
        # &timeType=6&displayZone=&zoneId=&pppStatus=0&agentName=

        # Take today's date:
        # nowTime = datetime.datetime.now().strftime('%Y-%m-%d').split('-')
        # strNowTime = nowTime[0]+'%3A'+nowTime[1]+'%3A'+nowTime[2]
        # Lao Luo asked for a week's worth of data, so use the configured window instead
        strNowTime = crawlerStartTime
        strEndTime = crawlerEndTime
        MyUrl_SourceList = []

        ftp = open(SOURCEURL_FILENAME, 'r')
        for line in ftp.readlines():
            myUrllist = line.split(',')
            # url = myUrllist[0]+strNowTime+myUrllist[1]+strNowTime+myUrllist[2]
            url = myUrllist[0] + strNowTime + myUrllist[1] + strEndTime + myUrllist[2]
            URL_inf = URLinformation(url.strip('\n'), 0, 0.0, 0.0)  # format
            URL_inf.Flag = 0
            URL_inf.DeepNum = 1
            URL_inf.domain = UrlUtil.getdomain(url)
            MyUrl_SourceList.append(URL_inf)
        ftp.close()

        return MyUrl_SourceList
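
The commented-out lines above show the date format the search URL expects: year, month, and day joined with %3A (a URL-encoded colon). A small sketch of how crawlerStartTime and crawlerEndTime could be derived for the one-week window mentioned in the comments; the two variable names come from the example, the helper itself is an assumption:

    import datetime

    def weekWindow():
        # Hypothetical helper: format the past week as 'YYYY%3AMM%3ADD' strings,
        # matching the commented-out strftime logic above.
        end = datetime.datetime.now()
        start = end - datetime.timedelta(days=7)
        fmt = '%Y-%m-%d'
        return (start.strftime(fmt).replace('-', '%3A'),
                end.strftime(fmt).replace('-', '%3A'))

    crawlerStartTime, crawlerEndTime = weekWindow()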