def findContactPageUrl(self,url):
        result=""
        if not url:
            return result
        if not url.startswith("http"):
            return result
        if FliterRegular.websiteFiltered(url):
            return
        print "Dealing the url to get the contact page:",url

        self.contactPageRegular=Inputs.contactPageRegular()
        shortUrllength=25

        htmlfile=self.getpage(url)
        try:
            soup=BeautifulSoup(htmlfile,'lxml')
        except BaseException:
            return ""

        for regular in self.contactPageRegular:
            contact=soup.find("a",{"href":re.compile(r".*?%s.*?" % regular,re.DOTALL|re.IGNORECASE)})
            if contact:
                if contact["href"].startswith("/"):
                    #print url+contact["href"]
                    return url+contact["href"]
                elif len(contact["href"])<shortUrllength:
                    #print url+"/"+contact["href"]
                    return url+"/"+contact["href"]
                else:
                    #print contact["href"]
                    return contact["href"]
        return ""
# Example #2
# 0
    def main(self):

        max,threadLimit,local,sleeptime=self.showScreenInfor()

        print "Program Begin: "
        keys=Inputs.readKeywords()
        #开始对每个关键词进行处理

        for word in keys:
            print "Now ,the word is:",word,".\nIt is in progress."
            keyword=word.strip()
            self.mainGetUrls(keyword,max,sleeptime,local)

        print "All finish."
# Example #3
# 0
 def mainGetUrls(self,word="led light bulbs",max=1000,sleeptime=0,local=0):
     countries=[]
     if local==1:
         countries=Inputs.getCountries()
     if (not max)or max=="0":
         max=1000
     else:
         max=int(max)*10
     if local==1:
         for country in countries:
             print "now dealing country:"+country
             self.max=max
             self.country=country
             self.word=word
             keyword={
                 "q":word,
                 "cr":"country"+country
             }
             for i in range(0,self.max,10):
                 self.page=i
                 print "page:",i/10,"item:",i
                 url=self.originurl % (urllib.urlencode(keyword),str(self.page))
                 htmlfile=self.getpage(url)
                 self.findTitleAndUrl(htmlfile)
                 self.saveList()
                 if (not sleeptime)or sleeptime=="0":
                     sleeptime=5
                 if sleeptime:
                     print "waiting for :"+str(sleeptime)+" second,then continue"
                     sleep(int(sleeptime))
     else:
         self.max=max
         self.country="UK"
         self.word=word
         keyword={
             "q":word
         }
         for i in range(0,self.max,10):
             self.page=i
             print "page:",i/10,"item:",i
             url=self.originurl % (urllib.urlencode(keyword),str(self.page))
             htmlfile=self.getpage(url)
             self.findTitleAndUrl(htmlfile)
             self.saveList()
             if (not sleeptime)or sleeptime=="0":
                 sleeptime=5
             if sleeptime:
                 print "waiting for :"+str(sleeptime)+" second,then continue"
                 sleep(int(sleeptime))
    def main(self,titles=("looking_for","page","location"),allInformationInList="1"):

        max,threadlimit,local=self.showScreenInfor()

        print "Program Begin: "
        keys=Inputs.readKeywords()
        #开始对每个关键词进行处理

        #开启多线程
        threads=self.startThreadPool(threadlimit)

        for word in keys:
            print "Now ,the category and word are",word,",they are in progress."
            self.category=word.split(":")[0]
            keyword=word.split(":")[1]
            self.mainGetUrls(keyword,max,local,allInformationInList,titles)
        if allInformationInList!='1':
            self.mainMiningUrlDB(threadlimit)

        self.queue.join()

        print "All finish!!! \n END。"
    def mainGetUrls(self,word="led light bulbs",max=0,local=0,allInformationInList="1",titles=("looking_for","page","location")):
        self.max=max
        self.word=word
        self.page=1
        self.goalurl=self.formUrl(titles[0],self.word,titles[1],self.page,"","","0")
        if  local==1:
            locals=Inputs.getLocals()
            if locals:
                for l in locals:
                    print "Finding location: "+l
                    self.printTotalResults(max,l,titles)

                    print " dealing every page."
                    self.page=1
                    for p in range(1,self.max+1):
                        self.page=p
                        self.goalurl=self.formUrl(titles[0],self.word,titles[1],self.page,titles[2],l,"1")
                        url=self.goalurl
                        print "Now dealing Location: ",l
                        print "Dealing page: ",p
                        if allInformationInList=="1":
                            #全部信息都在列表页中
                            self.queue.put((url,self.word,self.category,self.country))
                            print "page: ",str(p)," information has got."
                        else:
                        #全部信息不都在列表页中,需要进入获取
                            self.getPageUrls(url)


                    if allInformationInList!="1":
                        print "Succeed in getting all pages,ready to write to DB."
                        self.saveUrlList()
                        self.contacturls=[]
                    #print "休息一分钟后继续获取下一个地区"
                    #sleep(60)

                print "Success!"


        else:
            self.printTotalResults(max,titles=titles)

            print " dealing every page."
            self.page=1
            for p in range(1,self.max+1):
                self.page=p
                self.goalurl=self.formUrl(titles[0],self.word,titles[1],self.page,"","","0")
                url=self.goalurl
                print "Dealing page: ",p

                if allInformationInList=="1":
                    #全部信息都在列表页中
                    self.queue.put((url,self.word,self.category,self.country))

                else:
                    #全部信息不都在列表页中,需要进入获取
                    self.getPageUrls(url)

            if allInformationInList!="1":
                print "Succeed in getting all pages,ready to write to DB."
                self.saveUrlList()
                self.contacturls=[]

            print "Success!"